Use of io.cdap.cdap.api.data.format.StructuredRecord in project cdap by caskdata.
In the class DataPipelineTest, method testSimpleConditionWithMultipleOutputActions.
/**
 * Runs a pipeline whose condition stage routes to one of two action -> source -> sink
 * branches, once per branch, and checks the sink contents, the per-branch metrics and
 * the value written by that branch's action.
 */
@Test
public void testSimpleConditionWithMultipleOutputActions() throws Exception {
  /*
   * Pipeline under test:
   *
   * condition --(true)---> action1 ---> trueSource ---> trueSink
   *     |
   *     |------(false)--> action2 ---> falseSource ---> falseSink
   */
  String appName = "SimpleConditionWithMultipleOutputActions";
  String trueSourceName = "true" + appName + "Source";
  String falseSourceName = "false" + appName + "Source";
  String trueSinkName = "true" + appName + "Sink";
  String falseSinkName = "false" + appName + "Sink";
  String actionTableName = "actionTable" + appName;

  Schema nameSchema = Schema.recordOf("testRecord", Schema.Field.of("name", Schema.of(Schema.Type.STRING)));

  ETLBatchConfig pipelineConfig = ETLBatchConfig.builder()
    .addStage(new ETLStage("trueSource", MockSource.getPlugin(trueSourceName, nameSchema)))
    .addStage(new ETLStage("falseSource", MockSource.getPlugin(falseSourceName, nameSchema)))
    .addStage(new ETLStage("trueSink", MockSink.getPlugin(trueSinkName)))
    .addStage(new ETLStage("falseSink", MockSink.getPlugin(falseSinkName)))
    .addStage(new ETLStage("condition", MockCondition.getPlugin("condition")))
    .addStage(new ETLStage("action1", MockAction.getPlugin(actionTableName, "row1", "key1", "val1")))
    .addStage(new ETLStage("action2", MockAction.getPlugin(actionTableName, "row2", "key2", "val2")))
    .addConnection("condition", "action1", true)
    .addConnection("action1", "trueSource")
    .addConnection("trueSource", "trueSink")
    .addConnection("condition", "action2", false)
    .addConnection("action2", "falseSource")
    .addConnection("falseSource", "falseSink")
    .build();

  ApplicationId appId = NamespaceId.DEFAULT.app(appName);
  ApplicationManager appManager =
    deployApplication(appId, new AppRequest<>(APP_ARTIFACT_RANGE, pipelineConfig));

  StructuredRecord samuel = StructuredRecord.builder(nameSchema).set("name", "samuel").build();
  StructuredRecord bob = StructuredRecord.builder(nameSchema).set("name", "bob").build();

  for (String branch : Arrays.asList("true", "false")) {
    boolean onTrueBranch = "true".equals(branch);
    String sourceName;
    String sinkName;
    if (onTrueBranch) {
      sourceName = trueSourceName;
      sinkName = trueSinkName;
    } else {
      sourceName = falseSourceName;
      sinkName = falseSinkName;
    }

    // feed the source that belongs to the branch being exercised
    DataSetManager<Table> sourceTable = getDataset(NamespaceId.DEFAULT.dataset(sourceName));
    MockSource.writeInput(sourceTable, ImmutableList.of(samuel, bob));

    WorkflowManager workflow = appManager.getWorkflowManager(SmartWorkflow.NAME);
    workflow.start(ImmutableMap.of("condition.branch.to.execute", branch));
    if (onTrueBranch) {
      workflow.waitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
    } else {
      // the "false" iteration is the second run of the same workflow
      workflow.waitForRuns(ProgramRunStatus.COMPLETED, 2, 5, TimeUnit.MINUTES);
    }

    // the branch's sink must hold exactly the records that were written to its source
    DataSetManager<Table> sinkTable = getDataset(sinkName);
    Set<StructuredRecord> written = Sets.newHashSet(MockSink.readOutput(sinkTable));
    Set<StructuredRecord> wanted = ImmutableSet.of(samuel, bob);
    Assert.assertEquals(wanted, written);

    validateMetric(2, appId, branch + "Source.records.out");
    validateMetric(2, appId, branch + "Sink.records.in");

    // the action on the executed branch must have written its value
    String actionRow = onTrueBranch ? "row1" : "row2";
    String actionKey = onTrueBranch ? "key1" : "key2";
    String actionValue = onTrueBranch ? "val1" : "val2";
    DataSetManager<Table> actionTableDS = getDataset(actionTableName);
    Assert.assertEquals(actionValue, MockAction.readOutput(actionTableDS, actionRow, actionKey));
  }
}
Use of io.cdap.cdap.api.data.format.StructuredRecord in project cdap by caskdata.
In the class AutoJoinerTest, method testTripleAutoJoin.
/**
 * Deploys a pipeline that joins the users, purchases and interests inputs through a
 * MockAutoJoiner on (region, user_id), runs it once, and verifies the sink contents
 * and the join stage metrics.
 *
 * @param required list forwarded to MockAutoJoiner.getPlugin as its "required" argument
 * @param broadcast list forwarded to MockAutoJoiner.getPlugin as its "broadcast" argument
 * @param expected records the sink must contain after the run
 * @param engine engine the pipeline is configured to run on
 * @param tablesInOrderToJoin input stage names, in the order handed to the joiner
 * @throws Exception if deploying, running or reading the pipeline fails
 */
private void testTripleAutoJoin(List<String> required, List<String> broadcast, Set<StructuredRecord> expected, Engine engine, List<String> tablesInOrderToJoin) throws Exception {
  /*
       users ------|
                   |
       purchases --|--> join --> sink
                   |
       interests --|

       joinOn: users.region = purchases.region = interests.region and
               users.user_id = purchases.user_id = interests.user_id
   */
  String userInput = UUID.randomUUID().toString();
  String purchaseInput = UUID.randomUUID().toString();
  String interestInput = UUID.randomUUID().toString();
  String output = UUID.randomUUID().toString();
  ETLBatchConfig config = ETLBatchConfig.builder()
    .addStage(new ETLStage("users", MockSource.getPlugin(userInput, USER_SCHEMA)))
    .addStage(new ETLStage("purchases", MockSource.getPlugin(purchaseInput, PURCHASE_SCHEMA)))
    .addStage(new ETLStage("interests", MockSource.getPlugin(interestInput, INTEREST_SCHEMA)))
    .addStage(new ETLStage("join", MockAutoJoiner.getPlugin(tablesInOrderToJoin, Arrays.asList("region", "user_id"), required, broadcast, Collections.emptyList(), true)))
    .addStage(new ETLStage("sink", MockSink.getPlugin(output)))
    .addConnection("users", "join")
    .addConnection("purchases", "join")
    .addConnection("interests", "join")
    .addConnection("join", "sink")
    .setEngine(engine)
    .build();
  AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, config);
  ApplicationId appId = NamespaceId.DEFAULT.app(UUID.randomUUID().toString());
  ApplicationManager appManager = deployApplication(appId, appRequest);

  // write input data
  List<StructuredRecord> userData = Arrays.asList(USER_ALICE, USER_ALYCE, USER_BOB);
  DataSetManager<Table> inputManager = getDataset(userInput);
  MockSource.writeInput(inputManager, userData);

  List<StructuredRecord> purchaseData = new ArrayList<>();
  purchaseData.add(StructuredRecord.builder(PURCHASE_SCHEMA).set("region", "us").set("user_id", 0).set("purchase_id", 123).build());
  purchaseData.add(StructuredRecord.builder(PURCHASE_SCHEMA).set("region", "us").set("user_id", 2).set("purchase_id", 456).build());
  inputManager = getDataset(purchaseInput);
  MockSource.writeInput(inputManager, purchaseData);

  List<StructuredRecord> interestData = new ArrayList<>();
  interestData.add(StructuredRecord.builder(INTEREST_SCHEMA).set("region", "us").set("user_id", 0).set("interest", "food").build());
  interestData.add(StructuredRecord.builder(INTEREST_SCHEMA).set("region", "us").set("user_id", 0).set("interest", "sports").build());
  interestData.add(StructuredRecord.builder(INTEREST_SCHEMA).set("region", "us").set("user_id", 1).set("interest", "gardening").build());
  interestData.add(StructuredRecord.builder(INTEREST_SCHEMA).set("region", "us").set("user_id", 2).set("interest", "gaming").build());
  inputManager = getDataset(interestInput);
  MockSource.writeInput(inputManager, interestData);

  WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
  workflowManager.startAndWaitForGoodRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);

  DataSetManager<Table> outputManager = getDataset(output);
  List<StructuredRecord> outputRecords = MockSink.readOutput(outputManager);

  Set<StructuredRecord> actual = new HashSet<>();
  Schema expectedSchema = expected.isEmpty() ? null : expected.iterator().next().getSchema();
  // Records can be compared as-is when there is nothing to compare against, when the sink
  // is empty (no schema to inspect; assertEquals below reports the mismatch instead of an
  // IndexOutOfBoundsException), or when the output schema already equals the expected one.
  // Schemas are compared by value, not by reference.
  if (expectedSchema == null || outputRecords.isEmpty()
    || expectedSchema.equals(outputRecords.get(0).getSchema())) {
    actual.addAll(outputRecords);
  } else {
    // reorder the output columns of the join result (actual) to match the column order of expected
    for (StructuredRecord sr : outputRecords) {
      actual.add(StructuredRecord.builder(expectedSchema)
                   .set("purchases_region", sr.get("purchases_region"))
                   .set("purchases_purchase_id", sr.get("purchases_purchase_id"))
                   .set("purchases_user_id", sr.get("purchases_user_id"))
                   .set("users_region", sr.get("users_region"))
                   .set("users_user_id", sr.get("users_user_id"))
                   .set("users_name", sr.get("users_name"))
                   .set("interests_region", sr.get("interests_region"))
                   .set("interests_user_id", sr.get("interests_user_id"))
                   .set("interests_interest", sr.get("interests_interest"))
                   .build());
    }
  }
  Assert.assertEquals(expected, actual);

  // 3 users + 2 purchases + 4 interests are fed into the join
  validateMetric(9, appId, "join.records.in");
  validateMetric(expected.size(), appId, "join.records.out");
}
Use of io.cdap.cdap.api.data.format.StructuredRecord in project cdap by caskdata.
In the class AutoJoinerTest, method testTripleAutoNoneRequiredJoin.
/**
 * Joins purchases, users and interests with an empty "required" list and checks that
 * the same expected records come out for three different join orders.
 */
@Test
public void testTripleAutoNoneRequiredJoin() throws Exception {
  /*
     With nothing required, all the JOINS will be full outer joins, i.e.
     Purchases (outer) Users (outer) Interests
   */
  Schema optionalString = Schema.nullableOf(Schema.of(Schema.Type.STRING));
  Schema optionalInt = Schema.nullableOf(Schema.of(Schema.Type.INT));
  Schema joinedSchema = Schema.recordOf(
    "purchases.users.interests",
    Schema.Field.of("purchases_region", optionalString),
    Schema.Field.of("purchases_purchase_id", optionalInt),
    Schema.Field.of("purchases_user_id", optionalInt),
    Schema.Field.of("users_region", optionalString),
    Schema.Field.of("users_user_id", optionalInt),
    Schema.Field.of("users_name", optionalString),
    Schema.Field.of("interests_region", optionalString),
    Schema.Field.of("interests_user_id", optionalInt),
    Schema.Field.of("interests_interest", optionalString));

  Set<StructuredRecord> expected = new HashSet<>();
  // purchase 456 and the "gaming" interest share user 2, which has no users row
  expected.add(StructuredRecord.builder(joinedSchema)
                 .set("purchases_region", "us")
                 .set("purchases_purchase_id", 456)
                 .set("purchases_user_id", 2)
                 .set("interests_region", "us")
                 .set("interests_user_id", 2)
                 .set("interests_interest", "gaming")
                 .build());
  // alyce only appears on the users side
  expected.add(StructuredRecord.builder(joinedSchema)
                 .set("users_region", "eu")
                 .set("users_user_id", 0)
                 .set("users_name", "alyce")
                 .build());
  // alice has one purchase and two interests
  expected.add(StructuredRecord.builder(joinedSchema)
                 .set("purchases_region", "us")
                 .set("purchases_purchase_id", 123)
                 .set("purchases_user_id", 0)
                 .set("users_region", "us")
                 .set("users_user_id", 0)
                 .set("users_name", "alice")
                 .set("interests_region", "us")
                 .set("interests_user_id", 0)
                 .set("interests_interest", "food")
                 .build());
  expected.add(StructuredRecord.builder(joinedSchema)
                 .set("purchases_region", "us")
                 .set("purchases_purchase_id", 123)
                 .set("purchases_user_id", 0)
                 .set("users_region", "us")
                 .set("users_user_id", 0)
                 .set("users_name", "alice")
                 .set("interests_region", "us")
                 .set("interests_user_id", 0)
                 .set("interests_interest", "sports")
                 .build());
  // bob has an interest but no purchase
  expected.add(StructuredRecord.builder(joinedSchema)
                 .set("users_region", "us")
                 .set("users_user_id", 1)
                 .set("users_name", "bob")
                 .set("interests_region", "us")
                 .set("interests_user_id", 1)
                 .set("interests_interest", "gardening")
                 .build());

  // The output should not be affected by the order of the joins
  List<List<String>> joinOrders = Arrays.asList(
    Arrays.asList("purchases", "users", "interests"),
    Arrays.asList("purchases", "interests", "users"),
    Arrays.asList("users", "purchases", "interests"));
  for (List<String> joinOrder : joinOrders) {
    testTripleAutoJoin(Collections.emptyList(), expected, Engine.SPARK, joinOrder);
  }
}
Use of io.cdap.cdap.api.data.format.StructuredRecord in project cdap by caskdata.
In the class AutoJoinerTest, method testDoubleBroadcastJoin.
/**
 * Runs the triple join on Spark with "purchases" as the only required input and with
 * "purchases" and "interests" passed as the broadcast list, then checks the joined records.
 */
@Test
public void testDoubleBroadcastJoin() throws Exception {
  Schema stringType = Schema.of(Schema.Type.STRING);
  Schema intType = Schema.of(Schema.Type.INT);
  Schema optionalString = Schema.nullableOf(Schema.of(Schema.Type.STRING));
  Schema optionalInt = Schema.nullableOf(Schema.of(Schema.Type.INT));

  // purchases columns are non-nullable; users and interests columns are nullable
  Schema joinedSchema = Schema.recordOf(
    "purchases.users.interests",
    Schema.Field.of("purchases_region", stringType),
    Schema.Field.of("purchases_purchase_id", intType),
    Schema.Field.of("purchases_user_id", intType),
    Schema.Field.of("users_region", optionalString),
    Schema.Field.of("users_user_id", optionalInt),
    Schema.Field.of("users_name", optionalString),
    Schema.Field.of("interests_region", optionalString),
    Schema.Field.of("interests_user_id", optionalInt),
    Schema.Field.of("interests_interest", optionalString));

  Set<StructuredRecord> expected = new HashSet<>();
  expected.add(StructuredRecord.builder(joinedSchema)
                 .set("purchases_region", "us")
                 .set("purchases_purchase_id", 123)
                 .set("purchases_user_id", 0)
                 .set("users_region", "us")
                 .set("users_user_id", 0)
                 .set("users_name", "alice")
                 .set("interests_region", "us")
                 .set("interests_user_id", 0)
                 .set("interests_interest", "food")
                 .build());
  expected.add(StructuredRecord.builder(joinedSchema)
                 .set("purchases_region", "us")
                 .set("purchases_purchase_id", 123)
                 .set("purchases_user_id", 0)
                 .set("users_region", "us")
                 .set("users_user_id", 0)
                 .set("users_name", "alice")
                 .set("interests_region", "us")
                 .set("interests_user_id", 0)
                 .set("interests_interest", "sports")
                 .build());
  // purchase 456 belongs to user 2, for which only an interest row is set
  expected.add(StructuredRecord.builder(joinedSchema)
                 .set("purchases_region", "us")
                 .set("purchases_purchase_id", 456)
                 .set("purchases_user_id", 2)
                 .set("interests_region", "us")
                 .set("interests_user_id", 2)
                 .set("interests_interest", "gaming")
                 .build());

  List<String> requiredInputs = Collections.singletonList("purchases");
  List<String> broadcastInputs = Arrays.asList("purchases", "interests");
  List<String> joinOrder = Arrays.asList("purchases", "users", "interests");
  testTripleAutoJoin(requiredInputs, broadcastInputs, expected, Engine.SPARK, joinOrder);
}
Use of io.cdap.cdap.api.data.format.StructuredRecord in project cdap by caskdata.
In the class AutoJoinerTest, method testAutoJoinWithMacrosAndEmptyInput.
/**
 * Exercises testAutoJoinWithMacros on both engines for three empty-input scenarios:
 * right side empty, left side empty, and both sides empty.
 */
@Test
public void testAutoJoinWithMacrosAndEmptyInput() throws Exception {
  Schema joinedSchema = Schema.recordOf(
    "joined",
    Schema.Field.of("region", Schema.of(Schema.Type.STRING)),
    Schema.Field.of("purchase_id", Schema.of(Schema.Type.INT)),
    Schema.Field.of("user_id", Schema.of(Schema.Type.INT)),
    Schema.Field.of("name", Schema.nullableOf(Schema.of(Schema.Type.STRING))));
  List<String> requiredInputs = Collections.singletonList("purchases");

  Set<StructuredRecord> expected = new HashSet<>();
  expected.add(StructuredRecord.builder(joinedSchema)
                 .set("region", "us")
                 .set("purchase_id", 123)
                 .set("user_id", 0)
                 .build());
  expected.add(StructuredRecord.builder(joinedSchema)
                 .set("region", "us")
                 .set("purchase_id", 456)
                 .set("user_id", 2)
                 .build());

  // right side empty: both purchases are expected, with the nullable name left unset
  for (Engine engine : Arrays.asList(Engine.SPARK, Engine.MAPREDUCE)) {
    testAutoJoinWithMacros(engine, requiredInputs, joinedSchema, expected, true, false);
  }

  // left side empty: nothing is expected
  expected.clear();
  for (Engine engine : Arrays.asList(Engine.MAPREDUCE, Engine.SPARK)) {
    testAutoJoinWithMacros(engine, requiredInputs, joinedSchema, expected, false, true);
  }

  // both sides empty: nothing is expected
  expected.clear();
  for (Engine engine : Arrays.asList(Engine.MAPREDUCE, Engine.SPARK)) {
    testAutoJoinWithMacros(engine, requiredInputs, joinedSchema, expected, true, true);
  }
}
Aggregations