use of io.cdap.cdap.etl.proto.v2.DataStreamsConfig in project cdap by cdapio.
the class DataStreamsTest method testLineageWithMacros.
@Test
public void testLineageWithMacros() throws Exception {
Schema schema = Schema.recordOf("test", Schema.Field.of("key", Schema.of(Schema.Type.STRING)), Schema.Field.of("value", Schema.of(Schema.Type.STRING)));
List<StructuredRecord> input = ImmutableList.of(StructuredRecord.builder(schema).set("key", "key1").set("value", "value1").build(), StructuredRecord.builder(schema).set("key", "key2").set("value", "value2").build());
String srcName = "lineageSource";
String sinkName1 = "lineageOutput1";
String sinkName2 = "lineageOutput2";
DataStreamsConfig etlConfig = DataStreamsConfig.builder().addStage(new ETLStage("source", MockSource.getPlugin(schema, input, 0L, srcName))).addStage(new ETLStage("sink", MockSink.getPlugin("${output}"))).addStage(new ETLStage("identity", IdentityTransform.getPlugin())).addConnection("source", "identity").addConnection("identity", "sink").setCheckpointDir(checkpointDir).setBatchInterval("1s").build();
ApplicationId appId = NamespaceId.DEFAULT.app("lineageApp");
AppRequest<DataStreamsConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
ApplicationManager appManager = deployApplication(appId, appRequest);
ProgramId spark = appId.spark(DataStreamsSparkLauncher.NAME);
RunId runId = testLineageWithMacro(appManager, new HashSet<>(input), sinkName1);
FieldLineageAdmin fieldAdmin = getFieldLineageAdmin();
LineageAdmin lineageAdmin = getLineageAdmin();
// wait for the lineage get populated
Tasks.waitFor(true, () -> {
Lineage dsLineage = lineageAdmin.computeLineage(NamespaceId.DEFAULT.dataset(srcName), 0, System.currentTimeMillis(), 1, "workflow");
DatasetFieldLineageSummary fll = fieldAdmin.getDatasetFieldLineage(Constants.FieldLineage.Direction.BOTH, EndPoint.of("default", srcName), 0, System.currentTimeMillis());
return dsLineage.getRelations().size() == 2 && !fll.getOutgoing().isEmpty();
}, 10, TimeUnit.SECONDS);
Lineage lineage = lineageAdmin.computeLineage(NamespaceId.DEFAULT.dataset(srcName), 0, System.currentTimeMillis(), 1, "workflow");
Set<Relation> expectedLineage = ImmutableSet.of(new Relation(NamespaceId.DEFAULT.dataset(srcName), spark, AccessType.READ, runId), new Relation(NamespaceId.DEFAULT.dataset(sinkName1), spark, AccessType.WRITE, runId));
Assert.assertEquals(expectedLineage, lineage.getRelations());
DatasetFieldLineageSummary summary = fieldAdmin.getDatasetFieldLineage(Constants.FieldLineage.Direction.BOTH, EndPoint.of("default", srcName), 0, System.currentTimeMillis());
Assert.assertEquals(NamespaceId.DEFAULT.dataset(srcName), summary.getDatasetId());
Assert.assertEquals(ImmutableSet.of("key", "value"), summary.getFields());
Assert.assertTrue(summary.getIncoming().isEmpty());
Set<DatasetFieldLineageSummary.FieldLineageRelations> outgoing = summary.getOutgoing();
Assert.assertEquals(1, outgoing.size());
Set<DatasetFieldLineageSummary.FieldLineageRelations> expectedRelations = Collections.singleton(new DatasetFieldLineageSummary.FieldLineageRelations(NamespaceId.DEFAULT.dataset(sinkName1), 2, ImmutableSet.of(new FieldRelation("key", "key"), new FieldRelation("value", "value"))));
Assert.assertEquals(expectedRelations, outgoing);
// here sleep for 1 seconds to start the second run because the dataset lineage is storing based on unit second
TimeUnit.SECONDS.sleep(1);
long startTimeMillis = System.currentTimeMillis();
runId = testLineageWithMacro(appManager, new HashSet<>(input), sinkName2);
// wait for the lineage get populated
Tasks.waitFor(true, () -> {
Lineage dsLineage = lineageAdmin.computeLineage(NamespaceId.DEFAULT.dataset(srcName), startTimeMillis, System.currentTimeMillis(), 1, "workflow");
long end = System.currentTimeMillis();
DatasetFieldLineageSummary fll = fieldAdmin.getDatasetFieldLineage(Constants.FieldLineage.Direction.BOTH, EndPoint.of("default", srcName), startTimeMillis, end);
return dsLineage.getRelations().size() == 2 && !fll.getOutgoing().isEmpty();
}, 10, TimeUnit.SECONDS);
lineage = lineageAdmin.computeLineage(NamespaceId.DEFAULT.dataset(srcName), startTimeMillis, System.currentTimeMillis(), 1, "workflow");
expectedLineage = ImmutableSet.of(new Relation(NamespaceId.DEFAULT.dataset(srcName), spark, AccessType.READ, runId), new Relation(NamespaceId.DEFAULT.dataset(sinkName2), spark, AccessType.WRITE, runId));
Assert.assertEquals(expectedLineage, lineage.getRelations());
summary = fieldAdmin.getDatasetFieldLineage(Constants.FieldLineage.Direction.BOTH, EndPoint.of("default", srcName), startTimeMillis, System.currentTimeMillis());
Assert.assertEquals(NamespaceId.DEFAULT.dataset(srcName), summary.getDatasetId());
Assert.assertEquals(ImmutableSet.of("key", "value"), summary.getFields());
Assert.assertTrue(summary.getIncoming().isEmpty());
outgoing = summary.getOutgoing();
Assert.assertEquals(1, outgoing.size());
expectedRelations = Collections.singleton(new DatasetFieldLineageSummary.FieldLineageRelations(NamespaceId.DEFAULT.dataset(sinkName2), 2, ImmutableSet.of(new FieldRelation("key", "key"), new FieldRelation("value", "value"))));
Assert.assertEquals(expectedRelations, outgoing);
}
use of io.cdap.cdap.etl.proto.v2.DataStreamsConfig in project cdap by cdapio.
the class DataStreamsTest method testJoin.
@Test
public void testJoin() throws Exception {
/*
* source1 ----> t1 ------
* | --> innerjoin ----> t4 ------
* source2 ----> t2 ------ |
* | ---> outerjoin --> sink1
* |
* source3 -------------------- t3 ------------------------
*/
Schema inputSchema1 = Schema.recordOf("customerRecord", Schema.Field.of("customer_id", Schema.of(Schema.Type.STRING)), Schema.Field.of("customer_name", Schema.of(Schema.Type.STRING)));
Schema inputSchema2 = Schema.recordOf("itemRecord", Schema.Field.of("item_id", Schema.of(Schema.Type.STRING)), Schema.Field.of("item_price", Schema.of(Schema.Type.LONG)), Schema.Field.of("cust_id", Schema.of(Schema.Type.STRING)), Schema.Field.of("cust_name", Schema.of(Schema.Type.STRING)));
Schema inputSchema3 = Schema.recordOf("transactionRecord", Schema.Field.of("t_id", Schema.of(Schema.Type.STRING)), Schema.Field.of("c_id", Schema.of(Schema.Type.STRING)), Schema.Field.of("i_id", Schema.of(Schema.Type.STRING)));
Schema outSchema2 = Schema.recordOf("join.output", Schema.Field.of("t_id", Schema.nullableOf(Schema.of(Schema.Type.STRING))), Schema.Field.of("c_id", Schema.nullableOf(Schema.of(Schema.Type.STRING))), Schema.Field.of("i_id", Schema.nullableOf(Schema.of(Schema.Type.STRING))), Schema.Field.of("customer_id", Schema.nullableOf(Schema.of(Schema.Type.STRING))), Schema.Field.of("customer_name", Schema.nullableOf(Schema.of(Schema.Type.STRING))), Schema.Field.of("item_id", Schema.nullableOf(Schema.of(Schema.Type.STRING))), Schema.Field.of("item_price", Schema.nullableOf(Schema.of(Schema.Type.LONG))), Schema.Field.of("cust_id", Schema.nullableOf(Schema.of(Schema.Type.STRING))), Schema.Field.of("cust_name", Schema.nullableOf(Schema.of(Schema.Type.STRING))));
StructuredRecord recordSamuel = StructuredRecord.builder(inputSchema1).set("customer_id", "1").set("customer_name", "samuel").build();
StructuredRecord recordBob = StructuredRecord.builder(inputSchema1).set("customer_id", "2").set("customer_name", "bob").build();
StructuredRecord recordJane = StructuredRecord.builder(inputSchema1).set("customer_id", "3").set("customer_name", "jane").build();
StructuredRecord recordCar = StructuredRecord.builder(inputSchema2).set("item_id", "11").set("item_price", 10000L).set("cust_id", "1").set("cust_name", "samuel").build();
StructuredRecord recordBike = StructuredRecord.builder(inputSchema2).set("item_id", "22").set("item_price", 100L).set("cust_id", "3").set("cust_name", "jane").build();
StructuredRecord recordTrasCar = StructuredRecord.builder(inputSchema3).set("t_id", "1").set("c_id", "1").set("i_id", "11").build();
StructuredRecord recordTrasBike = StructuredRecord.builder(inputSchema3).set("t_id", "2").set("c_id", "3").set("i_id", "22").build();
StructuredRecord recordTrasPlane = StructuredRecord.builder(inputSchema3).set("t_id", "3").set("c_id", "4").set("i_id", "33").build();
List<StructuredRecord> input1 = ImmutableList.of(recordSamuel, recordBob, recordJane);
List<StructuredRecord> input2 = ImmutableList.of(recordCar, recordBike);
List<StructuredRecord> input3 = ImmutableList.of(recordTrasCar, recordTrasBike, recordTrasPlane);
String outputName = "multiJoinOutputSink";
DataStreamsConfig etlConfig = DataStreamsConfig.builder().addStage(new ETLStage("source1", MockSource.getPlugin(inputSchema1, input1))).addStage(new ETLStage("source2", MockSource.getPlugin(inputSchema2, input2))).addStage(new ETLStage("source3", MockSource.getPlugin(inputSchema3, input3))).addStage(new ETLStage("t1", IdentityTransform.getPlugin())).addStage(new ETLStage("t2", IdentityTransform.getPlugin())).addStage(new ETLStage("t3", IdentityTransform.getPlugin())).addStage(new ETLStage("t4", IdentityTransform.getPlugin())).addStage(new ETLStage("innerjoin", MockJoiner.getPlugin("t1.customer_id=t2.cust_id", "t1,t2", ""))).addStage(new ETLStage("outerjoin", MockJoiner.getPlugin("t4.item_id=t3.i_id", "", ""))).addStage(new ETLStage("multijoinSink", MockSink.getPlugin(outputName))).addConnection("source1", "t1").addConnection("source2", "t2").addConnection("source3", "t3").addConnection("t1", "innerjoin").addConnection("t2", "innerjoin").addConnection("innerjoin", "t4").addConnection("t3", "outerjoin").addConnection("t4", "outerjoin").addConnection("outerjoin", "multijoinSink").setBatchInterval("5s").setCheckpointDir(checkpointDir).build();
AppRequest<DataStreamsConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
ApplicationId appId = NamespaceId.DEFAULT.app("JoinerApp");
ApplicationManager appManager = deployApplication(appId, appRequest);
SparkManager sparkManager = appManager.getSparkManager(DataStreamsSparkLauncher.NAME);
sparkManager.start();
sparkManager.waitForRun(ProgramRunStatus.RUNNING, 10, TimeUnit.SECONDS);
StructuredRecord joinRecordSamuel = StructuredRecord.builder(outSchema2).set("customer_id", "1").set("customer_name", "samuel").set("item_id", "11").set("item_price", 10000L).set("cust_id", "1").set("cust_name", "samuel").set("t_id", "1").set("c_id", "1").set("i_id", "11").build();
StructuredRecord joinRecordJane = StructuredRecord.builder(outSchema2).set("customer_id", "3").set("customer_name", "jane").set("item_id", "22").set("item_price", 100L).set("cust_id", "3").set("cust_name", "jane").set("t_id", "2").set("c_id", "3").set("i_id", "22").build();
StructuredRecord joinRecordPlane = StructuredRecord.builder(outSchema2).set("t_id", "3").set("c_id", "4").set("i_id", "33").build();
Set<StructuredRecord> expected = ImmutableSet.of(joinRecordSamuel, joinRecordJane, joinRecordPlane);
DataSetManager<Table> outputManager = getDataset(outputName);
Tasks.waitFor(true, () -> {
outputManager.flush();
Set<StructuredRecord> outputRecords = new HashSet<>(MockSink.readOutput(outputManager));
return expected.equals(outputRecords);
}, 4, TimeUnit.MINUTES);
sparkManager.stop();
sparkManager.waitForStopped(10, TimeUnit.SECONDS);
validateMetric(appId, "source1.records.out", 3);
validateMetric(appId, "source2.records.out", 2);
validateMetric(appId, "source3.records.out", 3);
validateMetric(appId, "t1.records.in", 3);
validateMetric(appId, "t1.records.out", 3);
validateMetric(appId, "t2.records.in", 2);
validateMetric(appId, "t2.records.out", 2);
validateMetric(appId, "t3.records.in", 3);
validateMetric(appId, "t3.records.out", 3);
validateMetric(appId, "t4.records.in", 2);
validateMetric(appId, "t4.records.out", 2);
validateMetric(appId, "innerjoin.records.in", 5);
validateMetric(appId, "innerjoin.records.out", 2);
validateMetric(appId, "outerjoin.records.in", 5);
validateMetric(appId, "outerjoin.records.out", 3);
validateMetric(appId, "multijoinSink.records.in", 3);
}
use of io.cdap.cdap.etl.proto.v2.DataStreamsConfig in project cdap by cdapio.
the class DataStreamsTest method testSplitterTransform.
@Test
public void testSplitterTransform() throws Exception {
Schema schema = Schema.recordOf("user", Schema.Field.of("id", Schema.of(Schema.Type.LONG)), Schema.Field.of("name", Schema.nullableOf(Schema.of(Schema.Type.STRING))), Schema.Field.of("email", Schema.nullableOf(Schema.of(Schema.Type.STRING))));
StructuredRecord user0 = StructuredRecord.builder(schema).set("id", 0L).build();
StructuredRecord user1 = StructuredRecord.builder(schema).set("id", 1L).set("email", "one@example.com").build();
StructuredRecord user2 = StructuredRecord.builder(schema).set("id", 2L).set("name", "two").build();
StructuredRecord user3 = StructuredRecord.builder(schema).set("id", 3L).set("name", "three").set("email", "three@example.com").build();
File outputDir = TMP_FOLDER.newFolder();
String output1 = new File(outputDir, "output1").getAbsolutePath();
String output2 = new File(outputDir, "output2").getAbsolutePath();
/*
*
* |null --> sink1
* |null--> splitter2 --|
* source --> splitter1--| |non-null --|
* | |--> sink2
* |non-null------------------------|
*/
DataStreamsConfig config = DataStreamsConfig.builder().setBatchInterval("5s").addStage(new ETLStage("source", MockSource.getPlugin(schema, ImmutableList.of(user0, user1, user2, user3)))).addStage(new ETLStage("splitter1", NullFieldSplitterTransform.getPlugin("name"))).addStage(new ETLStage("splitter2", NullFieldSplitterTransform.getPlugin("email"))).addStage(new ETLStage("sink1", MockExternalSink.getPlugin(UUID.randomUUID().toString(), "sink1", output1))).addStage(new ETLStage("sink2", MockExternalSink.getPlugin(UUID.randomUUID().toString(), "sink2", output2))).addConnection("source", "splitter1").addConnection("splitter1", "splitter2", "null").addConnection("splitter1", "sink2", "non-null").addConnection("splitter2", "sink1", "null").addConnection("splitter2", "sink2", "non-null").setCheckpointDir(checkpointDir).build();
AppRequest<DataStreamsConfig> appRequest = new AppRequest<>(APP_ARTIFACT, config);
ApplicationId appId = NamespaceId.DEFAULT.app("SplitterTest");
ApplicationManager appManager = deployApplication(appId, appRequest);
// run pipeline
Map<String, String> args = Collections.singletonMap(io.cdap.cdap.etl.common.Constants.CONSOLIDATE_STAGES, "true");
SparkManager sparkManager = appManager.getSparkManager(DataStreamsSparkLauncher.NAME);
sparkManager.startAndWaitForGoodRun(args, ProgramRunStatus.RUNNING, 10, TimeUnit.SECONDS);
// check output
// sink1 should only have records where both name and email are null (user0)
Set<StructuredRecord> expected1 = ImmutableSet.of(user0);
Tasks.waitFor(true, () -> {
Set<StructuredRecord> outputRecords = new HashSet<>(MockExternalSink.readOutput(output1, schema));
return expected1.equals(outputRecords);
}, 4, TimeUnit.MINUTES);
// sink2 should have anything with a non-null name or non-null email
Set<StructuredRecord> expected2 = ImmutableSet.of(user1, user2, user3);
Tasks.waitFor(true, () -> {
Set<StructuredRecord> outputRecords = new HashSet<>(MockExternalSink.readOutput(output2, schema));
return expected2.equals(outputRecords);
}, 4, TimeUnit.MINUTES);
sparkManager.stop();
sparkManager.waitForStopped(10, TimeUnit.SECONDS);
validateMetric(appId, "source.records.out", 4);
validateMetric(appId, "splitter1.records.in", 4);
validateMetric(appId, "splitter1.records.out.non-null", 2);
validateMetric(appId, "splitter1.records.out.null", 2);
validateMetric(appId, "splitter2.records.in", 2);
validateMetric(appId, "splitter2.records.out.non-null", 1);
validateMetric(appId, "splitter2.records.out.null", 1);
validateMetric(appId, "sink1.records.in", 1);
validateMetric(appId, "sink2.records.in", 3);
}
use of io.cdap.cdap.etl.proto.v2.DataStreamsConfig in project cdap by cdapio.
the class DataStreamsTest method testParallelAggregators.
private void testParallelAggregators(boolean isReducibleAggregator) throws Exception {
String sink1Name = "pAggOutput1-" + isReducibleAggregator;
String sink2Name = "pAggOutput2-" + isReducibleAggregator;
Schema inputSchema = Schema.recordOf("testRecord", Schema.Field.of("user", Schema.of(Schema.Type.STRING)), Schema.Field.of("item", Schema.of(Schema.Type.LONG)));
List<StructuredRecord> input1 = ImmutableList.of(StructuredRecord.builder(inputSchema).set("user", "samuel").set("item", 1L).build(), StructuredRecord.builder(inputSchema).set("user", "samuel").set("item", 2L).build());
List<StructuredRecord> input2 = ImmutableList.of(StructuredRecord.builder(inputSchema).set("user", "samuel").set("item", 3L).build(), StructuredRecord.builder(inputSchema).set("user", "john").set("item", 4L).build(), StructuredRecord.builder(inputSchema).set("user", "john").set("item", 3L).build());
/*
source1 --|--> agg1 --> sink1
|
source2 --|--> agg2 --> sink2
*/
DataStreamsConfig pipelineConfig = DataStreamsConfig.builder().setBatchInterval("5s").addStage(new ETLStage("source1", MockSource.getPlugin(inputSchema, input1))).addStage(new ETLStage("source2", MockSource.getPlugin(inputSchema, input2))).addStage(new ETLStage("sink1", MockSink.getPlugin(sink1Name))).addStage(new ETLStage("sink2", MockSink.getPlugin(sink2Name))).addStage(new ETLStage("agg1", isReducibleAggregator ? FieldCountReducibleAggregator.getPlugin("user", "string") : FieldCountAggregator.getPlugin("user", "string"))).addStage(new ETLStage("agg2", isReducibleAggregator ? FieldCountReducibleAggregator.getPlugin("item", "long") : FieldCountAggregator.getPlugin("item", "long"))).addConnection("source1", "agg1").addConnection("source1", "agg2").addConnection("source2", "agg1").addConnection("source2", "agg2").addConnection("agg1", "sink1").addConnection("agg2", "sink2").setCheckpointDir(checkpointDir).disableCheckpoints().build();
AppRequest<DataStreamsConfig> appRequest = new AppRequest<>(APP_ARTIFACT, pipelineConfig);
ApplicationId appId = NamespaceId.DEFAULT.app("ParallelAggApp" + isReducibleAggregator);
ApplicationManager appManager = deployApplication(appId, appRequest);
SparkManager sparkManager = appManager.getSparkManager(DataStreamsSparkLauncher.NAME);
sparkManager.start();
sparkManager.waitForRun(ProgramRunStatus.RUNNING, 10, TimeUnit.SECONDS);
Schema outputSchema1 = Schema.recordOf("user.count", Schema.Field.of("user", Schema.of(Schema.Type.STRING)), Schema.Field.of("ct", Schema.of(Schema.Type.LONG)));
Schema outputSchema2 = Schema.recordOf("item.count", Schema.Field.of("item", Schema.of(Schema.Type.LONG)), Schema.Field.of("ct", Schema.of(Schema.Type.LONG)));
// check output
DataSetManager<Table> sinkManager1 = getDataset(sink1Name);
Set<StructuredRecord> expected1 = ImmutableSet.of(StructuredRecord.builder(outputSchema1).set("user", "all").set("ct", 5L).build(), StructuredRecord.builder(outputSchema1).set("user", "samuel").set("ct", 3L).build(), StructuredRecord.builder(outputSchema1).set("user", "john").set("ct", 2L).build());
Tasks.waitFor(true, () -> {
sinkManager1.flush();
Set<StructuredRecord> outputRecords = new HashSet<>(MockSink.readOutput(sinkManager1));
return expected1.equals(outputRecords);
}, 1, TimeUnit.MINUTES);
DataSetManager<Table> sinkManager2 = getDataset(sink2Name);
Set<StructuredRecord> expected2 = ImmutableSet.of(StructuredRecord.builder(outputSchema2).set("item", 0L).set("ct", 5L).build(), StructuredRecord.builder(outputSchema2).set("item", 1L).set("ct", 1L).build(), StructuredRecord.builder(outputSchema2).set("item", 2L).set("ct", 1L).build(), StructuredRecord.builder(outputSchema2).set("item", 3L).set("ct", 2L).build(), StructuredRecord.builder(outputSchema2).set("item", 4L).set("ct", 1L).build());
Tasks.waitFor(true, () -> {
sinkManager2.flush();
Set<StructuredRecord> outputRecords = new HashSet<>(MockSink.readOutput(sinkManager2));
return expected2.equals(outputRecords);
}, 1, TimeUnit.MINUTES);
sparkManager.stop();
sparkManager.waitForStopped(10, TimeUnit.SECONDS);
validateMetric(appId, "source1.records.out", 2);
validateMetric(appId, "source2.records.out", 3);
validateMetric(appId, "agg1.records.in", 5);
validateMetric(appId, "agg1.records.out", 3);
validateMetric(appId, "agg2.records.in", 5);
validateMetric(appId, "agg2.records.out", 5);
validateMetric(appId, "sink1.records.in", 3);
validateMetric(appId, "sink2.records.in", 5);
}
use of io.cdap.cdap.etl.proto.v2.DataStreamsConfig in project cdap by cdapio.
the class DataStreamsTest method testAggregatorJoinerMacrosWithCheckpoints.
private void testAggregatorJoinerMacrosWithCheckpoints(boolean isReducibleAggregator) throws Exception {
/*
|--> aggregator --> sink1
users1 --|
|----|
|--> dupeFlagger --> sink2
users2 -------|
*/
Schema userSchema = Schema.recordOf("user", Schema.Field.of("id", Schema.of(Schema.Type.LONG)), Schema.Field.of("name", Schema.of(Schema.Type.STRING)));
List<StructuredRecord> users1 = ImmutableList.of(StructuredRecord.builder(userSchema).set("id", 1L).set("name", "Samuel").build(), StructuredRecord.builder(userSchema).set("id", 2L).set("name", "Dwayne").build(), StructuredRecord.builder(userSchema).set("id", 3L).set("name", "Terry").build());
List<StructuredRecord> users2 = ImmutableList.of(StructuredRecord.builder(userSchema).set("id", 1L).set("name", "Samuel").build(), StructuredRecord.builder(userSchema).set("id", 2L).set("name", "Dwayne").build(), StructuredRecord.builder(userSchema).set("id", 4L).set("name", "Terry").build(), StructuredRecord.builder(userSchema).set("id", 5L).set("name", "Christopher").build());
DataStreamsConfig pipelineConfig = DataStreamsConfig.builder().setBatchInterval("5s").addStage(new ETLStage("users1", MockSource.getPlugin(userSchema, users1))).addStage(new ETLStage("users2", MockSource.getPlugin(userSchema, users2))).addStage(new ETLStage("sink1", MockSink.getPlugin("sink1"))).addStage(new ETLStage("sink2", MockSink.getPlugin("sink2"))).addStage(new ETLStage("aggregator", isReducibleAggregator ? FieldCountReducibleAggregator.getPlugin("${aggfield}", "${aggType}") : FieldCountAggregator.getPlugin("${aggfield}", "${aggType}"))).addStage(new ETLStage("dupeFlagger", DupeFlagger.getPlugin("users1", "${flagField}"))).addConnection("users1", "aggregator").addConnection("aggregator", "sink1").addConnection("users1", "dupeFlagger").addConnection("users2", "dupeFlagger").addConnection("dupeFlagger", "sink2").setCheckpointDir(checkpointDir).build();
AppRequest<DataStreamsConfig> appRequest = new AppRequest<>(APP_ARTIFACT, pipelineConfig);
ApplicationId appId = NamespaceId.DEFAULT.app("ParallelAggJoinApp" + isReducibleAggregator);
ApplicationManager appManager = deployApplication(appId, appRequest);
// run it once with this set of macros
Map<String, String> arguments = new HashMap<>();
arguments.put("aggfield", "id");
arguments.put("aggType", "long");
arguments.put("flagField", "isDupe");
SparkManager sparkManager = appManager.getSparkManager(DataStreamsSparkLauncher.NAME);
sparkManager.start(arguments);
sparkManager.waitForRun(ProgramRunStatus.RUNNING, 10, TimeUnit.SECONDS);
DataSetManager<Table> sink1 = getDataset("sink1");
DataSetManager<Table> sink2 = getDataset("sink2");
Schema aggSchema = Schema.recordOf("id.count", Schema.Field.of("id", Schema.of(Schema.Type.LONG)), Schema.Field.of("ct", Schema.of(Schema.Type.LONG)));
Set<StructuredRecord> expectedAggregates = ImmutableSet.of(StructuredRecord.builder(aggSchema).set("id", 0L).set("ct", 3L).build(), StructuredRecord.builder(aggSchema).set("id", 1L).set("ct", 1L).build(), StructuredRecord.builder(aggSchema).set("id", 2L).set("ct", 1L).build(), StructuredRecord.builder(aggSchema).set("id", 3L).set("ct", 1L).build());
Schema outputSchema = Schema.recordOf("user.flagged", Schema.Field.of("id", Schema.of(Schema.Type.LONG)), Schema.Field.of("name", Schema.of(Schema.Type.STRING)), Schema.Field.of("isDupe", Schema.of(Schema.Type.BOOLEAN)));
Set<StructuredRecord> expectedJoined = ImmutableSet.of(StructuredRecord.builder(outputSchema).set("id", 1L).set("name", "Samuel").set("isDupe", true).build(), StructuredRecord.builder(outputSchema).set("id", 2L).set("name", "Dwayne").set("isDupe", true).build(), StructuredRecord.builder(outputSchema).set("id", 3L).set("name", "Terry").set("isDupe", false).build());
Tasks.waitFor(true, () -> {
sink1.flush();
sink2.flush();
Set<StructuredRecord> actualAggs = new HashSet<>(MockSink.readOutput(sink1));
Set<StructuredRecord> actualJoined = new HashSet<>(MockSink.readOutput(sink2));
return expectedAggregates.equals(actualAggs) && expectedJoined.equals(actualJoined);
}, 1, TimeUnit.MINUTES);
sparkManager.stop();
sparkManager.waitForStopped(30, TimeUnit.SECONDS);
MockSink.clear(sink1);
MockSink.clear(sink2);
// run it again with different macros to make sure they are re-evaluated and not stored in the checkpoint
arguments = new HashMap<>();
arguments.put("aggfield", "name");
arguments.put("aggType", "string");
arguments.put("flagField", "dupe");
sparkManager.start(arguments);
sparkManager.waitForRun(ProgramRunStatus.RUNNING, 10, TimeUnit.SECONDS);
aggSchema = Schema.recordOf("name.count", Schema.Field.of("name", Schema.of(Schema.Type.STRING)), Schema.Field.of("ct", Schema.of(Schema.Type.LONG)));
Set<StructuredRecord> expectedAggregates2 = ImmutableSet.of(StructuredRecord.builder(aggSchema).set("name", "all").set("ct", 3L).build(), StructuredRecord.builder(aggSchema).set("name", "Samuel").set("ct", 1L).build(), StructuredRecord.builder(aggSchema).set("name", "Dwayne").set("ct", 1L).build(), StructuredRecord.builder(aggSchema).set("name", "Terry").set("ct", 1L).build());
outputSchema = Schema.recordOf("user.flagged", Schema.Field.of("id", Schema.of(Schema.Type.LONG)), Schema.Field.of("name", Schema.of(Schema.Type.STRING)), Schema.Field.of("dupe", Schema.of(Schema.Type.BOOLEAN)));
Set<StructuredRecord> expectedJoined2 = ImmutableSet.of(StructuredRecord.builder(outputSchema).set("id", 1L).set("name", "Samuel").set("dupe", true).build(), StructuredRecord.builder(outputSchema).set("id", 2L).set("name", "Dwayne").set("dupe", true).build(), StructuredRecord.builder(outputSchema).set("id", 3L).set("name", "Terry").set("dupe", false).build());
Tasks.waitFor(true, () -> {
sink1.flush();
sink2.flush();
Set<StructuredRecord> actualAggs = new HashSet<>(MockSink.readOutput(sink1));
Set<StructuredRecord> actualJoined = new HashSet<>(MockSink.readOutput(sink2));
return expectedAggregates2.equals(actualAggs) && expectedJoined2.equals(actualJoined);
}, 1, TimeUnit.MINUTES);
sparkManager.stop();
MockSink.clear(sink1);
MockSink.clear(sink2);
}
Aggregations