use of io.cdap.cdap.etl.proto.v2.DataStreamsConfig in project cdap by caskdata.
the class DataStreamsTest method testTransformComputeWithMacros.
@Test
public void testTransformComputeWithMacros() throws Exception {
Schema schema = Schema.recordOf("test", Schema.Field.of("id", Schema.of(Schema.Type.STRING)), Schema.Field.of("name", Schema.of(Schema.Type.STRING)));
List<StructuredRecord> input = new ArrayList<>();
StructuredRecord samuelRecord = StructuredRecord.builder(schema).set("id", "123").set("name", "samuel").build();
StructuredRecord jacksonRecord = StructuredRecord.builder(schema).set("id", "456").set("name", "jackson").build();
StructuredRecord dwayneRecord = StructuredRecord.builder(schema).set("id", "789").set("name", "dwayne").build();
StructuredRecord johnsonRecord = StructuredRecord.builder(schema).set("id", "0").set("name", "johnson").build();
input.add(samuelRecord);
input.add(jacksonRecord);
input.add(dwayneRecord);
input.add(johnsonRecord);
DataStreamsConfig etlConfig = DataStreamsConfig.builder().addStage(new ETLStage("source", MockSource.getPlugin(schema, input))).addStage(new ETLStage("sink", MockSink.getPlugin("${output}"))).addStage(new ETLStage("filter1", StringValueFilterTransform.getPlugin("${field}", "${val1}"))).addStage(new ETLStage("filter2", StringValueFilterCompute.getPlugin("${field}", "${val2}"))).addStage(new ETLStage("sleep", SleepTransform.getPlugin(2L))).addConnection("source", "sleep").addConnection("sleep", "filter1").addConnection("filter1", "filter2").addConnection("filter2", "sink").setBatchInterval("1s").build();
ApplicationId appId = NamespaceId.DEFAULT.app("simpleApp");
AppRequest<DataStreamsConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
ApplicationManager appManager = deployApplication(appId, appRequest);
final Set<StructuredRecord> expected = new HashSet<>();
expected.add(samuelRecord);
expected.add(jacksonRecord);
testTransformComputeRun(appManager, expected, "dwayne", "johnson", "macroOutput1");
validateMetric(appId, "source.records.out", 4);
validateMetric(appId, "sleep.records.in", 4);
validateMetric(appId, "sleep.records.out", 4);
validateMetric(appId, "filter1.records.in", 4);
validateMetric(appId, "filter1.records.out", 3);
validateMetric(appId, "filter2.records.in", 3);
validateMetric(appId, "filter2.records.out", 2);
validateMetric(appId, "sink.records.in", 2);
Assert.assertTrue(getMetric(appId, "sleep." + co.cask.cdap.etl.common.Constants.Metrics.TOTAL_TIME) > 0L);
expected.clear();
expected.add(dwayneRecord);
expected.add(johnsonRecord);
testTransformComputeRun(appManager, expected, "samuel", "jackson", "macroOutput2");
}
use of io.cdap.cdap.etl.proto.v2.DataStreamsConfig in project cdap by caskdata.
the class Upgrader method convertStreamsConfig.
private DataStreamsConfig convertStreamsConfig(String configStr) {
DataStreamsConfig config = GSON.fromJson(configStr, DataStreamsConfig.class);
DataStreamsConfig.Builder builder = DataStreamsConfig.builder().addConnections(config.getConnections()).setResources(config.getResources()).setDriverResources(config.getDriverResources()).setClientResources(config.getClientResources()).setBatchInterval(config.getBatchInterval()).setCheckpointDir(config.getCheckpointDir()).setNumOfRecordsPreview(config.getNumOfRecordsPreview());
for (ETLStage stage : config.getStages()) {
builder.addStage(stage.upgradeStage(dataStreamsContext));
}
return builder.build();
}
use of io.cdap.cdap.etl.proto.v2.DataStreamsConfig in project cdap by caskdata.
the class DataStreamsTest method testAutoJoinNullEquality.
private void testAutoJoinNullEquality(boolean nullSafe) throws Exception {
/*
* customers ----------|
* |
* |---> join ---> sink
* |
* transactions -------|
*/
Schema inputSchema1 = Schema.recordOf("customer", Schema.Field.of("customer_id", Schema.nullableOf(Schema.of(Schema.Type.STRING))), Schema.Field.of("customer_name", Schema.nullableOf(Schema.of(Schema.Type.STRING))));
Schema inputSchema2 = Schema.recordOf("transaction", Schema.Field.of("t_id", Schema.of(Schema.Type.STRING)), Schema.Field.of("customer_id", Schema.nullableOf(Schema.of(Schema.Type.STRING))), Schema.Field.of("item_id", Schema.of(Schema.Type.STRING)));
Schema outSchema = Schema.recordOf("customers.transactions", Schema.Field.of("customers_customer_id", Schema.nullableOf(Schema.of(Schema.Type.STRING))), Schema.Field.of("customers_customer_name", Schema.nullableOf(Schema.of(Schema.Type.STRING))), Schema.Field.of("transactions_t_id", Schema.of(Schema.Type.STRING)), Schema.Field.of("transactions_customer_id", Schema.nullableOf(Schema.of(Schema.Type.STRING))), Schema.Field.of("transactions_item_id", Schema.of(Schema.Type.STRING)));
StructuredRecord recordSamuel = StructuredRecord.builder(inputSchema1).set("customer_id", "1").set("customer_name", "samuel").build();
StructuredRecord recordBob = StructuredRecord.builder(inputSchema1).set("customer_name", "bob").build();
StructuredRecord recordJane = StructuredRecord.builder(inputSchema1).set("customer_id", "3").set("customer_name", "jane").build();
StructuredRecord trans1 = StructuredRecord.builder(inputSchema2).set("t_id", "1").set("customer_id", "1").set("item_id", "11").build();
StructuredRecord trans2 = StructuredRecord.builder(inputSchema2).set("t_id", "2").set("customer_id", "3").set("item_id", "22").build();
StructuredRecord trans3 = StructuredRecord.builder(inputSchema2).set("t_id", "3").set("item_id", "33").build();
List<StructuredRecord> input1 = ImmutableList.of(recordSamuel, recordBob, recordJane);
List<StructuredRecord> input2 = ImmutableList.of(trans1, trans2, trans3);
String outputName = UUID.randomUUID().toString();
DataStreamsConfig etlConfig = DataStreamsConfig.builder().addStage(new ETLStage("customers", MockSource.getPlugin(inputSchema1, input1))).addStage(new ETLStage("transactions", MockSource.getPlugin(inputSchema2, input2))).addStage(new ETLStage("join", MockAutoJoiner.getPlugin(Arrays.asList("customers", "transactions"), Collections.singletonList("customer_id"), Collections.singletonList("transactions"), Collections.emptyList(), Collections.emptyList(), nullSafe))).addStage(new ETLStage("sink", MockSink.getPlugin(outputName))).addConnection("customers", "join").addConnection("transactions", "join").addConnection("join", "sink").setBatchInterval("5s").setCheckpointDir(checkpointDir).build();
AppRequest<DataStreamsConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
ApplicationId appId = NamespaceId.DEFAULT.app(UUID.randomUUID().toString());
ApplicationManager appManager = deployApplication(appId, appRequest);
SparkManager sparkManager = appManager.getSparkManager(DataStreamsSparkLauncher.NAME);
sparkManager.start();
sparkManager.waitForRun(ProgramRunStatus.RUNNING, 10, TimeUnit.SECONDS);
StructuredRecord join1 = StructuredRecord.builder(outSchema).set("customers_customer_id", "1").set("customers_customer_name", "samuel").set("transactions_t_id", "1").set("transactions_customer_id", "1").set("transactions_item_id", "11").build();
StructuredRecord join2 = StructuredRecord.builder(outSchema).set("customers_customer_id", "3").set("customers_customer_name", "jane").set("transactions_t_id", "2").set("transactions_customer_id", "3").set("transactions_item_id", "22").build();
StructuredRecord join3;
if (nullSafe) {
// this transaction has a null customer id, which should match with the null id from customers
join3 = StructuredRecord.builder(outSchema).set("transactions_t_id", "3").set("transactions_item_id", "33").set("customers_customer_name", "bob").build();
} else {
// this transaction has a null customer id, which should not match with the null id from customers
join3 = StructuredRecord.builder(outSchema).set("transactions_t_id", "3").set("transactions_item_id", "33").build();
}
Set<StructuredRecord> expected = ImmutableSet.of(join1, join2, join3);
DataSetManager<Table> outputManager = getDataset(outputName);
Tasks.waitFor(true, () -> {
outputManager.flush();
Set<StructuredRecord> outputRecords = new HashSet<>(MockSink.readOutput(outputManager));
return expected.equals(outputRecords);
}, 4, TimeUnit.MINUTES);
sparkManager.stop();
sparkManager.waitForStopped(10, TimeUnit.SECONDS);
}
use of io.cdap.cdap.etl.proto.v2.DataStreamsConfig in project cdap by caskdata.
the class DataStreamsTest method testAutoJoin.
@Test
public void testAutoJoin() throws Exception {
/*
* customers ----------|
* |
* |---> join ---> sink
* |
* transactions -------|
*/
Schema inputSchema1 = Schema.recordOf("customer", Schema.Field.of("customer_id", Schema.of(Schema.Type.STRING)), Schema.Field.of("customer_name", Schema.of(Schema.Type.STRING)));
Schema inputSchema2 = Schema.recordOf("transaction", Schema.Field.of("t_id", Schema.of(Schema.Type.STRING)), Schema.Field.of("customer_id", Schema.of(Schema.Type.STRING)), Schema.Field.of("item_id", Schema.of(Schema.Type.STRING)));
Schema outSchema = Schema.recordOf("customers.transactions", Schema.Field.of("customers_customer_id", Schema.nullableOf(Schema.of(Schema.Type.STRING))), Schema.Field.of("customers_customer_name", Schema.nullableOf(Schema.of(Schema.Type.STRING))), Schema.Field.of("transactions_t_id", Schema.of(Schema.Type.STRING)), Schema.Field.of("transactions_customer_id", Schema.of(Schema.Type.STRING)), Schema.Field.of("transactions_item_id", Schema.of(Schema.Type.STRING)));
StructuredRecord recordSamuel = StructuredRecord.builder(inputSchema1).set("customer_id", "1").set("customer_name", "samuel").build();
StructuredRecord recordBob = StructuredRecord.builder(inputSchema1).set("customer_id", "2").set("customer_name", "bob").build();
StructuredRecord recordJane = StructuredRecord.builder(inputSchema1).set("customer_id", "3").set("customer_name", "jane").build();
StructuredRecord tx1 = StructuredRecord.builder(inputSchema2).set("t_id", "1").set("customer_id", "1").set("item_id", "11").build();
StructuredRecord tx2 = StructuredRecord.builder(inputSchema2).set("t_id", "2").set("customer_id", "3").set("item_id", "22").build();
StructuredRecord tx3 = StructuredRecord.builder(inputSchema2).set("t_id", "3").set("customer_id", "4").set("item_id", "33").build();
List<StructuredRecord> input1 = ImmutableList.of(recordSamuel, recordBob, recordJane);
List<StructuredRecord> input2 = ImmutableList.of(tx1, tx2, tx3);
String outputName = UUID.randomUUID().toString();
DataStreamsConfig etlConfig = DataStreamsConfig.builder().addStage(new ETLStage("customers", MockSource.getPlugin(inputSchema1, input1))).addStage(new ETLStage("transactions", MockSource.getPlugin(inputSchema2, input2))).addStage(new ETLStage("join", MockAutoJoiner.getPlugin(Arrays.asList("customers", "transactions"), Collections.singletonList("customer_id"), Collections.singletonList("transactions"), Collections.emptyList(), Collections.emptyList(), true))).addStage(new ETLStage("sink", MockSink.getPlugin(outputName))).addConnection("customers", "join").addConnection("transactions", "join").addConnection("join", "sink").setBatchInterval("5s").setCheckpointDir(checkpointDir).build();
AppRequest<DataStreamsConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
ApplicationId appId = NamespaceId.DEFAULT.app("AutoJoinerApp");
ApplicationManager appManager = deployApplication(appId, appRequest);
SparkManager sparkManager = appManager.getSparkManager(DataStreamsSparkLauncher.NAME);
sparkManager.start();
sparkManager.waitForRun(ProgramRunStatus.RUNNING, 10, TimeUnit.SECONDS);
StructuredRecord join1 = StructuredRecord.builder(outSchema).set("customers_customer_id", "1").set("customers_customer_name", "samuel").set("transactions_t_id", "1").set("transactions_customer_id", "1").set("transactions_item_id", "11").build();
StructuredRecord join2 = StructuredRecord.builder(outSchema).set("customers_customer_id", "3").set("customers_customer_name", "jane").set("transactions_t_id", "2").set("transactions_customer_id", "3").set("transactions_item_id", "22").build();
StructuredRecord join3 = StructuredRecord.builder(outSchema).set("transactions_t_id", "3").set("transactions_customer_id", "4").set("transactions_item_id", "33").build();
Set<StructuredRecord> expected = ImmutableSet.of(join1, join2, join3);
DataSetManager<Table> outputManager = getDataset(outputName);
Tasks.waitFor(true, () -> {
outputManager.flush();
Set<StructuredRecord> outputRecords = new HashSet<>(MockSink.readOutput(outputManager));
return expected.equals(outputRecords);
}, 4, TimeUnit.MINUTES);
sparkManager.stop();
sparkManager.waitForStopped(10, TimeUnit.SECONDS);
}
use of io.cdap.cdap.etl.proto.v2.DataStreamsConfig in project cdap by caskdata.
the class DataStreamsTest method testAggregatorJoinerMacrosWithCheckpoints.
private void testAggregatorJoinerMacrosWithCheckpoints(boolean isReducibleAggregator) throws Exception {
/*
|--> aggregator --> sink1
users1 --|
|----|
|--> dupeFlagger --> sink2
users2 -------|
*/
Schema userSchema = Schema.recordOf("user", Schema.Field.of("id", Schema.of(Schema.Type.LONG)), Schema.Field.of("name", Schema.of(Schema.Type.STRING)));
List<StructuredRecord> users1 = ImmutableList.of(StructuredRecord.builder(userSchema).set("id", 1L).set("name", "Samuel").build(), StructuredRecord.builder(userSchema).set("id", 2L).set("name", "Dwayne").build(), StructuredRecord.builder(userSchema).set("id", 3L).set("name", "Terry").build());
List<StructuredRecord> users2 = ImmutableList.of(StructuredRecord.builder(userSchema).set("id", 1L).set("name", "Samuel").build(), StructuredRecord.builder(userSchema).set("id", 2L).set("name", "Dwayne").build(), StructuredRecord.builder(userSchema).set("id", 4L).set("name", "Terry").build(), StructuredRecord.builder(userSchema).set("id", 5L).set("name", "Christopher").build());
DataStreamsConfig pipelineConfig = DataStreamsConfig.builder().setBatchInterval("5s").addStage(new ETLStage("users1", MockSource.getPlugin(userSchema, users1))).addStage(new ETLStage("users2", MockSource.getPlugin(userSchema, users2))).addStage(new ETLStage("sink1", MockSink.getPlugin("sink1"))).addStage(new ETLStage("sink2", MockSink.getPlugin("sink2"))).addStage(new ETLStage("aggregator", isReducibleAggregator ? FieldCountReducibleAggregator.getPlugin("${aggfield}", "${aggType}") : FieldCountAggregator.getPlugin("${aggfield}", "${aggType}"))).addStage(new ETLStage("dupeFlagger", DupeFlagger.getPlugin("users1", "${flagField}"))).addConnection("users1", "aggregator").addConnection("aggregator", "sink1").addConnection("users1", "dupeFlagger").addConnection("users2", "dupeFlagger").addConnection("dupeFlagger", "sink2").setCheckpointDir(checkpointDir).build();
AppRequest<DataStreamsConfig> appRequest = new AppRequest<>(APP_ARTIFACT, pipelineConfig);
ApplicationId appId = NamespaceId.DEFAULT.app("ParallelAggJoinApp" + isReducibleAggregator);
ApplicationManager appManager = deployApplication(appId, appRequest);
// run it once with this set of macros
Map<String, String> arguments = new HashMap<>();
arguments.put("aggfield", "id");
arguments.put("aggType", "long");
arguments.put("flagField", "isDupe");
SparkManager sparkManager = appManager.getSparkManager(DataStreamsSparkLauncher.NAME);
sparkManager.start(arguments);
sparkManager.waitForRun(ProgramRunStatus.RUNNING, 10, TimeUnit.SECONDS);
DataSetManager<Table> sink1 = getDataset("sink1");
DataSetManager<Table> sink2 = getDataset("sink2");
Schema aggSchema = Schema.recordOf("id.count", Schema.Field.of("id", Schema.of(Schema.Type.LONG)), Schema.Field.of("ct", Schema.of(Schema.Type.LONG)));
Set<StructuredRecord> expectedAggregates = ImmutableSet.of(StructuredRecord.builder(aggSchema).set("id", 0L).set("ct", 3L).build(), StructuredRecord.builder(aggSchema).set("id", 1L).set("ct", 1L).build(), StructuredRecord.builder(aggSchema).set("id", 2L).set("ct", 1L).build(), StructuredRecord.builder(aggSchema).set("id", 3L).set("ct", 1L).build());
Schema outputSchema = Schema.recordOf("user.flagged", Schema.Field.of("id", Schema.of(Schema.Type.LONG)), Schema.Field.of("name", Schema.of(Schema.Type.STRING)), Schema.Field.of("isDupe", Schema.of(Schema.Type.BOOLEAN)));
Set<StructuredRecord> expectedJoined = ImmutableSet.of(StructuredRecord.builder(outputSchema).set("id", 1L).set("name", "Samuel").set("isDupe", true).build(), StructuredRecord.builder(outputSchema).set("id", 2L).set("name", "Dwayne").set("isDupe", true).build(), StructuredRecord.builder(outputSchema).set("id", 3L).set("name", "Terry").set("isDupe", false).build());
Tasks.waitFor(true, () -> {
sink1.flush();
sink2.flush();
Set<StructuredRecord> actualAggs = new HashSet<>(MockSink.readOutput(sink1));
Set<StructuredRecord> actualJoined = new HashSet<>(MockSink.readOutput(sink2));
return expectedAggregates.equals(actualAggs) && expectedJoined.equals(actualJoined);
}, 1, TimeUnit.MINUTES);
sparkManager.stop();
sparkManager.waitForStopped(30, TimeUnit.SECONDS);
MockSink.clear(sink1);
MockSink.clear(sink2);
// run it again with different macros to make sure they are re-evaluated and not stored in the checkpoint
arguments = new HashMap<>();
arguments.put("aggfield", "name");
arguments.put("aggType", "string");
arguments.put("flagField", "dupe");
sparkManager.start(arguments);
sparkManager.waitForRun(ProgramRunStatus.RUNNING, 10, TimeUnit.SECONDS);
aggSchema = Schema.recordOf("name.count", Schema.Field.of("name", Schema.of(Schema.Type.STRING)), Schema.Field.of("ct", Schema.of(Schema.Type.LONG)));
Set<StructuredRecord> expectedAggregates2 = ImmutableSet.of(StructuredRecord.builder(aggSchema).set("name", "all").set("ct", 3L).build(), StructuredRecord.builder(aggSchema).set("name", "Samuel").set("ct", 1L).build(), StructuredRecord.builder(aggSchema).set("name", "Dwayne").set("ct", 1L).build(), StructuredRecord.builder(aggSchema).set("name", "Terry").set("ct", 1L).build());
outputSchema = Schema.recordOf("user.flagged", Schema.Field.of("id", Schema.of(Schema.Type.LONG)), Schema.Field.of("name", Schema.of(Schema.Type.STRING)), Schema.Field.of("dupe", Schema.of(Schema.Type.BOOLEAN)));
Set<StructuredRecord> expectedJoined2 = ImmutableSet.of(StructuredRecord.builder(outputSchema).set("id", 1L).set("name", "Samuel").set("dupe", true).build(), StructuredRecord.builder(outputSchema).set("id", 2L).set("name", "Dwayne").set("dupe", true).build(), StructuredRecord.builder(outputSchema).set("id", 3L).set("name", "Terry").set("dupe", false).build());
Tasks.waitFor(true, () -> {
sink1.flush();
sink2.flush();
Set<StructuredRecord> actualAggs = new HashSet<>(MockSink.readOutput(sink1));
Set<StructuredRecord> actualJoined = new HashSet<>(MockSink.readOutput(sink2));
return expectedAggregates2.equals(actualAggs) && expectedJoined2.equals(actualJoined);
}, 1, TimeUnit.MINUTES);
sparkManager.stop();
MockSink.clear(sink1);
MockSink.clear(sink2);
}
Aggregations