Use of io.cdap.cdap.api.data.format.StructuredRecord in the cdap project by caskdata.
Class AutoJoinerTest, method testAutoJoinWithMacros.
/**
 * Deploys a users/purchases auto-join pipeline whose joiner properties are all macros, runs it
 * with the macro values supplied as runtime arguments, and compares the sink contents with
 * {@code expectedRecords}.
 *
 * <pre>
 *   users ------|
 *               |--> join --> sink
 *   purchases --|
 *
 *   joinOn: users.region = purchases.region and users.user_id = purchases.user_id
 * </pre>
 *
 * @param engine execution engine the pipeline is configured with
 * @param required stage names used to build the value of the joiner's "required" property
 * @param expectedSchema schema supplied through the {@code schema} runtime argument
 * @param expectedRecords records the sink must contain once the run completes
 * @param excludeUsers when true, nothing is written to the users source
 * @param excludePurchases when true, nothing is written to the purchases source
 * @throws Exception if deploying, running, or reading back the pipeline fails
 */
private void testAutoJoinWithMacros(Engine engine,
                                    List<String> required,
                                    Schema expectedSchema,
                                    Set<StructuredRecord> expectedRecords,
                                    boolean excludeUsers,
                                    boolean excludePurchases) throws Exception {
  String userInput = UUID.randomUUID().toString();
  String purchaseInput = UUID.randomUUID().toString();
  String output = UUID.randomUUID().toString();

  // Every joiner setting is a macro; the concrete values arrive as runtime arguments below.
  Map<String, String> macroProps = new HashMap<>();
  macroProps.put(MockAutoJoiner.Conf.STAGES, "${stages}");
  macroProps.put(MockAutoJoiner.Conf.KEY, "${key}");
  macroProps.put(MockAutoJoiner.Conf.REQUIRED, "${required}");
  macroProps.put(MockAutoJoiner.Conf.SELECT, "${select}");
  // The schema macro is only configured for Spark, or for MapReduce with fewer than two required stages.
  boolean configureSchemaMacro = engine == Engine.SPARK || (required.size() < 2 && engine == Engine.MAPREDUCE);
  if (configureSchemaMacro) {
    macroProps.put(MockAutoJoiner.Conf.SCHEMA, "${schema}");
  }

  ETLStage usersStage = new ETLStage("users", MockSource.getPlugin(userInput));
  ETLStage purchasesStage = new ETLStage("purchases", MockSource.getPlugin(purchaseInput));
  ETLStage joinStage = new ETLStage("join", new ETLPlugin(MockAutoJoiner.NAME, BatchJoiner.PLUGIN_TYPE, macroProps));
  ETLStage sinkStage = new ETLStage("sink", MockSink.getPlugin(output));
  ETLBatchConfig config = ETLBatchConfig.builder()
    .addStage(usersStage)
    .addStage(purchasesStage)
    .addStage(joinStage)
    .addStage(sinkStage)
    .addConnection("users", "join")
    .addConnection("purchases", "join")
    .addConnection("join", "sink")
    .setEngine(engine)
    .build();

  ApplicationId appId = NamespaceId.DEFAULT.app(UUID.randomUUID().toString());
  ApplicationManager appManager = deployApplication(appId, new AppRequest<>(APP_ARTIFACT, config));

  // Populate the sources, skipping whichever side the caller asked to leave empty.
  if (!excludeUsers) {
    List<StructuredRecord> users = Arrays.asList(USER_ALICE, USER_ALYCE, USER_BOB);
    DataSetManager<Table> usersTable = getDataset(userInput);
    MockSource.writeInput(usersTable, users);
  }
  if (!excludePurchases) {
    List<StructuredRecord> purchases = Arrays.asList(
      StructuredRecord.builder(PURCHASE_SCHEMA)
        .set("region", "us").set("user_id", 0).set("purchase_id", 123).build(),
      StructuredRecord.builder(PURCHASE_SCHEMA)
        .set("region", "us").set("user_id", 2).set("purchase_id", 456).build());
    DataSetManager<Table> purchasesTable = getDataset(purchaseInput);
    MockSource.writeInput(purchasesTable, purchases);
  }

  // Compute the real property values and hand them over under the macro names.
  List<JoinField> selectedFields = new ArrayList<>(Arrays.asList(
    new JoinField("purchases", "region"),
    new JoinField("purchases", "purchase_id"),
    new JoinField("purchases", "user_id"),
    new JoinField("users", "name")));
  Map<String, String> resolvedProps = MockAutoJoiner.getProperties(
    Arrays.asList("purchases", "users"), Arrays.asList("region", "user_id"), required,
    Collections.emptyList(), selectedFields, true);

  Map<String, String> runtimeArgs = new HashMap<>();
  runtimeArgs.put("stages", resolvedProps.get(MockAutoJoiner.Conf.STAGES));
  runtimeArgs.put("key", resolvedProps.get(MockAutoJoiner.Conf.KEY));
  runtimeArgs.put("required", resolvedProps.get(MockAutoJoiner.Conf.REQUIRED));
  runtimeArgs.put("select", resolvedProps.get(MockAutoJoiner.Conf.SELECT));
  runtimeArgs.put("schema", expectedSchema.toString());

  WorkflowManager workflow = appManager.getWorkflowManager(SmartWorkflow.NAME);
  workflow.startAndWaitForGoodRun(runtimeArgs, ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);

  // Compare as sets: the order of the joined rows is not asserted.
  DataSetManager<Table> sinkTable = getDataset(output);
  Set<StructuredRecord> actualRecords = new HashSet<>(MockSink.readOutput(sinkTable));
  Assert.assertEquals(expectedRecords, actualRecords);
}
Use of io.cdap.cdap.api.data.format.StructuredRecord in the cdap project by caskdata.
Class AutoJoinerTest, method testAutoInnerJoinUsingSQLEngineWithExcludedStages.
/**
 * Runs the SQL-engine auto-join helper on Spark and expects a single joined row
 * (purchase 123 matched with user "alice").
 *
 * NOTE(review): the meaning of the trailing {@code ""} and {@code "join"} arguments is defined by
 * testSimpleAutoJoinUsingSQLEngineWithStageSettings; judging by this test's name, "join" is
 * presumably the excluded stage - confirm against that helper.
 */
@Test
public void testAutoInnerJoinUsingSQLEngineWithExcludedStages() throws Exception {
  Schema stringType = Schema.of(Schema.Type.STRING);
  Schema intType = Schema.of(Schema.Type.INT);
  Schema expectedSchema = Schema.recordOf(
    "purchases.users",
    Schema.Field.of("purchases_region", stringType),
    Schema.Field.of("purchases_purchase_id", intType),
    Schema.Field.of("purchases_user_id", intType),
    Schema.Field.of("users_region", stringType),
    Schema.Field.of("users_user_id", intType),
    Schema.Field.of("users_name", stringType));

  StructuredRecord aliceRow = StructuredRecord.builder(expectedSchema)
    .set("purchases_region", "us")
    .set("purchases_purchase_id", 123)
    .set("purchases_user_id", 0)
    .set("users_region", "us")
    .set("users_user_id", 0)
    .set("users_name", "alice")
    .build();
  Set<StructuredRecord> expected = new HashSet<>();
  expected.add(aliceRow);

  testSimpleAutoJoinUsingSQLEngineWithStageSettings(
    Arrays.asList("users", "purchases"), Collections.singletonList("users"),
    expected, expectedSchema, "", "join", Engine.SPARK);
}
Use of io.cdap.cdap.api.data.format.StructuredRecord in the cdap project by caskdata.
Class AutoJoinerTest, method testInnerBetweenCondition.
/**
 * Joins users to age groups with an expression (range) condition on Spark and verifies both the
 * joined rows and the per-stage record metrics.
 *
 * <pre>
 *   users ----------|
 *                   |--> join --> sink
 *   age_groups -----|
 *
 *   joinOn: users.age >= age_groups.lo and (users.age < age_groups.hi or age_groups.hi is null)
 * </pre>
 */
@Test
public void testInnerBetweenCondition() throws Exception {
  Schema stringType = Schema.of(Schema.Type.STRING);
  Schema intType = Schema.of(Schema.Type.INT);
  Schema userSchema = Schema.recordOf(
    "user",
    Schema.Field.of("name", stringType),
    Schema.Field.of("age", Schema.nullableOf(intType)));
  Schema ageGroupSchema = Schema.recordOf(
    "age_group",
    Schema.Field.of("name", stringType),
    Schema.Field.of("lo", intType),
    Schema.Field.of("hi", Schema.nullableOf(intType)));
  Schema expectedSchema = Schema.recordOf(
    "users.age_groups",
    Schema.Field.of("username", stringType),
    Schema.Field.of("age_group", stringType));

  String userInput = UUID.randomUUID().toString();
  String agesInput = UUID.randomUUID().toString();
  String output = UUID.randomUUID().toString();

  // Output columns: both "name" fields are aliased so they do not clash.
  List<JoinField> select = new ArrayList<>();
  select.add(new JoinField("users", "name", "username"));
  select.add(new JoinField("age_groups", "name", "age_group"));

  JoinCondition.OnExpression condition = JoinCondition.onExpression()
    .setExpression("users.age >= age_groups.lo and (users.age < age_groups.hi or age_groups.hi is null)")
    .build();
  Map<String, String> joinerProperties = MockAutoJoiner.getProperties(
    Arrays.asList("users", "age_groups"), Collections.emptyList(), Arrays.asList("users", "age_groups"),
    Collections.emptyList(), select, false, null, condition);

  ETLStage usersStage = new ETLStage("users", MockSource.getPlugin(userInput, userSchema));
  ETLStage ageGroupsStage = new ETLStage("age_groups", MockSource.getPlugin(agesInput, ageGroupSchema));
  ETLStage joinStage =
    new ETLStage("join", new ETLPlugin(MockAutoJoiner.NAME, BatchJoiner.PLUGIN_TYPE, joinerProperties));
  ETLStage sinkStage = new ETLStage("sink", MockSink.getPlugin(output));
  ETLBatchConfig config = ETLBatchConfig.builder()
    .addStage(usersStage)
    .addStage(ageGroupsStage)
    .addStage(joinStage)
    .addStage(sinkStage)
    .addConnection("users", "join")
    .addConnection("age_groups", "join")
    .addConnection("join", "sink")
    .setEngine(Engine.SPARK)
    .build();

  ApplicationId appId = NamespaceId.DEFAULT.app(UUID.randomUUID().toString());
  ApplicationManager appManager = deployApplication(appId, new AppRequest<>(APP_ARTIFACT, config));

  // Users: Bob has no age set and is absent from the expected output below.
  List<StructuredRecord> users = new ArrayList<>();
  users.add(StructuredRecord.builder(userSchema).set("name", "Alice").set("age", 35).build());
  users.add(StructuredRecord.builder(userSchema).set("name", "Bob").build());
  users.add(StructuredRecord.builder(userSchema).set("name", "Carl").set("age", 13).build());
  users.add(StructuredRecord.builder(userSchema).set("name", "Dave").set("age", 0).build());
  users.add(StructuredRecord.builder(userSchema).set("name", "Elaine").set("age", 68).build());
  users.add(StructuredRecord.builder(userSchema).set("name", "Fred").set("age", 4).build());
  DataSetManager<Table> usersTable = getDataset(userInput);
  MockSource.writeInput(usersTable, users);

  // Age groups: "senior" has no upper bound, which the "hi is null" branch of the condition covers.
  List<StructuredRecord> ageGroups = new ArrayList<>();
  ageGroups.add(StructuredRecord.builder(ageGroupSchema).set("name", "infant").set("lo", 0).set("hi", 2).build());
  ageGroups.add(StructuredRecord.builder(ageGroupSchema).set("name", "toddler").set("lo", 2).set("hi", 5).build());
  ageGroups.add(StructuredRecord.builder(ageGroupSchema).set("name", "child").set("lo", 5).set("hi", 13).build());
  ageGroups.add(StructuredRecord.builder(ageGroupSchema).set("name", "teen").set("lo", 13).set("hi", 20).build());
  ageGroups.add(StructuredRecord.builder(ageGroupSchema).set("name", "adult").set("lo", 20).set("hi", 65).build());
  ageGroups.add(StructuredRecord.builder(ageGroupSchema).set("name", "senior").set("lo", 65).build());
  DataSetManager<Table> ageGroupsTable = getDataset(agesInput);
  MockSource.writeInput(ageGroupsTable, ageGroups);

  WorkflowManager workflow = appManager.getWorkflowManager(SmartWorkflow.NAME);
  workflow.startAndWaitForGoodRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);

  DataSetManager<Table> sinkTable = getDataset(output);
  List<StructuredRecord> outputRecords = MockSink.readOutput(sinkTable);

  // Expected (username, age_group) pairs.
  String[][] expectedPairs = {
    {"Alice", "adult"},
    {"Carl", "teen"},
    {"Dave", "infant"},
    {"Elaine", "senior"},
    {"Fred", "toddler"},
  };
  Set<StructuredRecord> expected = new HashSet<>();
  for (String[] pair : expectedPairs) {
    expected.add(StructuredRecord.builder(expectedSchema)
                   .set("username", pair[0])
                   .set("age_group", pair[1])
                   .build());
  }
  Assert.assertEquals(expected, new HashSet<>(outputRecords));

  // Six records leave each source, so the join receives twelve and emits one per expected row.
  validateMetric(6, appId, "users.records.out");
  validateMetric(6, appId, "age_groups.records.out");
  validateMetric(12, appId, "join.records.in");
  validateMetric(expected.size(), appId, "join.records.out");
}
Use of io.cdap.cdap.api.data.format.StructuredRecord in the cdap project by caskdata.
Class AutoJoinerTest, method testNullNotEqual.
/**
 * Runs the null-equality helper with its flag set to {@code false} on both engines and expects
 * one fully matched row plus two rows that carry only a single items field.
 *
 * NOTE(review): the {@code false} argument presumably disables null-safe key comparison, as the
 * test name suggests - confirm against testNullEquality.
 */
@Test
public void testNullNotEqual() throws Exception {
  Schema nullableInt = Schema.nullableOf(Schema.of(Schema.Type.INT));
  Schema nullableString = Schema.nullableOf(Schema.of(Schema.Type.STRING));
  Schema expectedSchema = Schema.recordOf(
    "items.attributes",
    Schema.Field.of("items_id", nullableInt),
    Schema.Field.of("items_region", nullableString),
    Schema.Field.of("items_name", nullableString),
    Schema.Field.of("attributes_region", nullableString),
    Schema.Field.of("attributes_id", nullableInt),
    Schema.Field.of("attributes_attr", nullableString));

  StructuredRecord matchedRow = StructuredRecord.builder(expectedSchema)
    .set("items_id", 0)
    .set("items_region", "us")
    .set("items_name", "bacon")
    .set("attributes_region", "us")
    .set("attributes_id", 0)
    .set("attributes_attr", "food")
    .build();
  StructuredRecord idOnlyRow = StructuredRecord.builder(expectedSchema).set("items_id", 1).build();
  StructuredRecord regionOnlyRow = StructuredRecord.builder(expectedSchema).set("items_region", "us").build();

  Set<StructuredRecord> expected = new HashSet<>();
  expected.add(matchedRow);
  expected.add(idOnlyRow);
  expected.add(regionOnlyRow);

  // Same expectation on both engines, Spark first.
  for (Engine engine : new Engine[] {Engine.SPARK, Engine.MAPREDUCE}) {
    testNullEquality(engine, false, expected);
  }
}
Use of io.cdap.cdap.api.data.format.StructuredRecord in the cdap project by caskdata.
Class AutoJoinerTest, method testAutoOuterJoin.
/**
 * Runs the simple auto-join helper with an empty stage list on both engines and expects four
 * rows: one purchase matched to a user, one purchase-only row, and two user-only rows.
 *
 * NOTE(review): the empty list is presumably the set of required stages (none required, i.e. a
 * full outer join) - confirm against testSimpleAutoJoin.
 */
@Test
public void testAutoOuterJoin() throws Exception {
  Schema nullableInt = Schema.nullableOf(Schema.of(Schema.Type.INT));
  Schema nullableString = Schema.nullableOf(Schema.of(Schema.Type.STRING));
  Schema expectedSchema = Schema.recordOf(
    "purchases.users",
    Schema.Field.of("purchases_region", nullableString),
    Schema.Field.of("purchases_purchase_id", nullableInt),
    Schema.Field.of("purchases_user_id", nullableInt),
    Schema.Field.of("users_region", nullableString),
    Schema.Field.of("users_user_id", nullableInt),
    Schema.Field.of("users_name", nullableString));

  // Purchase 123 matched with alice.
  StructuredRecord matchedRow = StructuredRecord.builder(expectedSchema)
    .set("purchases_region", "us")
    .set("purchases_purchase_id", 123)
    .set("purchases_user_id", 0)
    .set("users_region", "us")
    .set("users_user_id", 0)
    .set("users_name", "alice")
    .build();
  // Purchase 456 with no user-side fields set.
  StructuredRecord purchaseOnlyRow = StructuredRecord.builder(expectedSchema)
    .set("purchases_region", "us")
    .set("purchases_purchase_id", 456)
    .set("purchases_user_id", 2)
    .build();
  // Users with no purchase-side fields set.
  StructuredRecord alyceOnlyRow = StructuredRecord.builder(expectedSchema)
    .set("users_region", "eu")
    .set("users_user_id", 0)
    .set("users_name", "alyce")
    .build();
  StructuredRecord bobOnlyRow = StructuredRecord.builder(expectedSchema)
    .set("users_region", "us")
    .set("users_user_id", 1)
    .set("users_name", "bob")
    .build();

  Set<StructuredRecord> expected = new HashSet<>();
  expected.add(matchedRow);
  expected.add(purchaseOnlyRow);
  expected.add(alyceOnlyRow);
  expected.add(bobOnlyRow);

  // Same expectation on both engines, Spark first.
  for (Engine engine : new Engine[] {Engine.SPARK, Engine.MAPREDUCE}) {
    testSimpleAutoJoin(Collections.emptyList(), expected, engine);
  }
}
Aggregations