Use of io.cdap.cdap.api.data.format.StructuredRecord in project cdap by caskdata.
From the class AutoJoinerTest, method testTripleAutoTwoRequiredJoin.
/**
 * Checks the three-way auto join when only {@code users} and {@code interests} are handed to the shared
 * helper as its first argument, running the same expectations on the Spark and then the MapReduce engine.
 */
@Test
public void testTripleAutoTwoRequiredJoin() throws Exception {
  Schema stringType = Schema.of(Schema.Type.STRING);
  Schema intType = Schema.of(Schema.Type.INT);
  Schema optionalString = Schema.nullableOf(stringType);
  Schema optionalInt = Schema.nullableOf(intType);

  // Only the purchases columns are nullable; the users and interests columns are declared non-null.
  Schema joinedSchema = Schema.recordOf(
    "purchases.users.interests",
    Schema.Field.of("purchases_region", optionalString),
    Schema.Field.of("purchases_purchase_id", optionalInt),
    Schema.Field.of("purchases_user_id", optionalInt),
    Schema.Field.of("users_region", stringType),
    Schema.Field.of("users_user_id", intType),
    Schema.Field.of("users_name", stringType),
    Schema.Field.of("interests_region", stringType),
    Schema.Field.of("interests_user_id", intType),
    Schema.Field.of("interests_interest", stringType));

  Set<StructuredRecord> joinedRows = new HashSet<>();

  // The two fully populated rows for user 0 (alice) differ only in the interests_interest value.
  for (String interest : Arrays.asList("food", "sports")) {
    joinedRows.add(StructuredRecord.builder(joinedSchema)
                     .set("purchases_region", "us")
                     .set("purchases_purchase_id", 123)
                     .set("purchases_user_id", 0)
                     .set("users_region", "us")
                     .set("users_user_id", 0)
                     .set("users_name", "alice")
                     .set("interests_region", "us")
                     .set("interests_user_id", 0)
                     .set("interests_interest", interest)
                     .build());
  }

  // The row for user 1 (bob) leaves every purchases column unset.
  joinedRows.add(StructuredRecord.builder(joinedSchema)
                   .set("users_region", "us")
                   .set("users_user_id", 1)
                   .set("users_name", "bob")
                   .set("interests_region", "us")
                   .set("interests_user_id", 1)
                   .set("interests_interest", "gardening")
                   .build());

  // Same inputs and expectations on each engine, Spark first.
  for (Engine engine : new Engine[] {Engine.SPARK, Engine.MAPREDUCE}) {
    testTripleAutoJoin(Arrays.asList("users", "interests"), joinedRows, engine, Collections.emptyList());
  }
}
Use of io.cdap.cdap.api.data.format.StructuredRecord in project cdap by caskdata.
From the class AutoJoinerTest, method testBroadcastJoin.
/**
 * Runs the two-input auto join on Spark twice with {@code users} and {@code purchases} as the first
 * argument, passing first {@code users} and then {@code purchases} as the single-element second argument
 * (presumably the broadcast stage list; the helper is defined outside this excerpt). The same single joined
 * row is expected both times.
 */
@Test
public void testBroadcastJoin() throws Exception {
  Schema stringType = Schema.of(Schema.Type.STRING);
  Schema intType = Schema.of(Schema.Type.INT);

  // No column is nullable in this expected schema.
  Schema joinedSchema = Schema.recordOf(
    "purchases.users",
    Schema.Field.of("purchases_region", stringType),
    Schema.Field.of("purchases_purchase_id", intType),
    Schema.Field.of("purchases_user_id", intType),
    Schema.Field.of("users_region", stringType),
    Schema.Field.of("users_user_id", intType),
    Schema.Field.of("users_name", stringType));

  StructuredRecord aliceRow = StructuredRecord.builder(joinedSchema)
    .set("purchases_region", "us")
    .set("purchases_purchase_id", 123)
    .set("purchases_user_id", 0)
    .set("users_region", "us")
    .set("users_user_id", 0)
    .set("users_name", "alice")
    .build();

  Set<StructuredRecord> joinedRows = new HashSet<>();
  joinedRows.add(aliceRow);

  // Only the Spark engine is exercised here.
  for (String broadcastStage : Arrays.asList("users", "purchases")) {
    testSimpleAutoJoin(Arrays.asList("users", "purchases"), Collections.singletonList(broadcastStage),
                       joinedRows, Engine.SPARK);
  }
}
Use of io.cdap.cdap.api.data.format.StructuredRecord in project cdap by caskdata.
From the class AutoJoinerTest, method testAutoLeftOuterJoinSkewed.
/**
 * Checks the skewed interests/users join with {@code interests} passed as the only element of the helper's
 * first argument, running the same expectations on the Spark and then the MapReduce engine.
 */
@Test
public void testAutoLeftOuterJoinSkewed() throws Exception {
  Schema stringType = Schema.of(Schema.Type.STRING);
  Schema intType = Schema.of(Schema.Type.INT);

  // The interests columns are non-null; every users column is nullable.
  Schema joinedSchema = Schema.recordOf(
    "interests.users",
    Schema.Field.of("interests_region", stringType),
    Schema.Field.of("interests_user_id", intType),
    Schema.Field.of("interests_interest", stringType),
    Schema.Field.of("users_region", Schema.nullableOf(stringType)),
    Schema.Field.of("users_user_id", Schema.nullableOf(intType)),
    Schema.Field.of("users_name", Schema.nullableOf(stringType)));

  Set<StructuredRecord> joinedRows = new HashSet<>();

  // User 0 (alice) appears once per interest; the three rows differ only in interests_interest.
  for (String interest : Arrays.asList("hiking", "running", "cooking")) {
    joinedRows.add(StructuredRecord.builder(joinedSchema)
                     .set("interests_region", "us")
                     .set("interests_user_id", 0)
                     .set("interests_interest", interest)
                     .set("users_region", "us")
                     .set("users_user_id", 0)
                     .set("users_name", "alice")
                     .build());
  }

  // User 1 (bob) contributes a single row.
  joinedRows.add(StructuredRecord.builder(joinedSchema)
                   .set("interests_region", "us")
                   .set("interests_user_id", 1)
                   .set("interests_interest", "hiking")
                   .set("users_region", "us")
                   .set("users_user_id", 1)
                   .set("users_name", "bob")
                   .build());

  // Same inputs and expectations on each engine, Spark first.
  for (Engine engine : new Engine[] {Engine.SPARK, Engine.MAPREDUCE}) {
    testSimpleAutoJoinSkewed(Collections.singletonList("interests"), joinedRows, engine);
  }
}
Use of io.cdap.cdap.api.data.format.StructuredRecord in project cdap by caskdata.
From the class AutoJoinerTest, method testQuadAutoJoin.
/**
 * Deploys and runs a batch pipeline that auto-joins four mock sources (users, purchases, interests, ages)
 * on {@code region} and {@code user_id}, then compares the sink contents with {@code expected} and checks
 * the join stage's record metrics.
 *
 * @param required stage names forwarded to {@code MockAutoJoiner.getPlugin} as its third argument
 * @param broadcast stage names forwarded to {@code MockAutoJoiner.getPlugin} as its fourth argument
 * @param expected records expected in the sink; when non-empty, the schema of its first element is the
 *                 layout the output records are rebuilt into before comparison
 * @param engine execution engine set on the pipeline config
 * @param tablesInOrderToJoin stage names forwarded to {@code MockAutoJoiner.getPlugin} as its first argument
 * @throws Exception if deploying, running or reading the pipeline fails
 */
private void testQuadAutoJoin(List<String> required, List<String> broadcast, Set<StructuredRecord> expected,
                              Engine engine, List<String> tablesInOrderToJoin) throws Exception {
  /*
       users ------|
                   |
       purchases --|--> join --> sink
                   |
       interests --|
                   |
       ages -------|

       joinOn: users.region = purchases.region = interests.region = ages.region and
               users.user_id = purchases.user_id = interests.user_id = ages.user_id
   */
  String userInput = UUID.randomUUID().toString();
  String purchaseInput = UUID.randomUUID().toString();
  String interestInput = UUID.randomUUID().toString();
  String ageInput = UUID.randomUUID().toString();
  String output = UUID.randomUUID().toString();

  // NOTE(review): the meaning of the trailing `true` flag is defined by MockAutoJoiner.getPlugin,
  // which is not visible here - confirm against that helper.
  ETLBatchConfig config = ETLBatchConfig.builder()
    .addStage(new ETLStage("users", MockSource.getPlugin(userInput, USER_SCHEMA)))
    .addStage(new ETLStage("purchases", MockSource.getPlugin(purchaseInput, PURCHASE_SCHEMA)))
    .addStage(new ETLStage("interests", MockSource.getPlugin(interestInput, INTEREST_SCHEMA)))
    .addStage(new ETLStage("ages", MockSource.getPlugin(ageInput, AGE_SCHEMA)))
    .addStage(new ETLStage("join", MockAutoJoiner.getPlugin(tablesInOrderToJoin,
                                                            Arrays.asList("region", "user_id"),
                                                            required, broadcast,
                                                            Collections.emptyList(), true)))
    .addStage(new ETLStage("sink", MockSink.getPlugin(output)))
    .addConnection("users", "join")
    .addConnection("purchases", "join")
    .addConnection("interests", "join")
    .addConnection("ages", "join")
    .addConnection("join", "sink")
    .setEngine(engine)
    .build();

  AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, config);
  ApplicationId appId = NamespaceId.DEFAULT.app(UUID.randomUUID().toString());
  ApplicationManager appManager = deployApplication(appId, appRequest);

  // write input data
  List<StructuredRecord> userData = Arrays.asList(USER_ALICE, USER_ALYCE, USER_BOB);
  DataSetManager<Table> inputManager = getDataset(userInput);
  MockSource.writeInput(inputManager, userData);

  List<StructuredRecord> purchaseData = new ArrayList<>();
  purchaseData.add(StructuredRecord.builder(PURCHASE_SCHEMA)
                     .set("region", "us").set("user_id", 0).set("purchase_id", 123).build());
  purchaseData.add(StructuredRecord.builder(PURCHASE_SCHEMA)
                     .set("region", "us").set("user_id", 2).set("purchase_id", 456).build());
  inputManager = getDataset(purchaseInput);
  MockSource.writeInput(inputManager, purchaseData);

  List<StructuredRecord> interestData = new ArrayList<>();
  interestData.add(StructuredRecord.builder(INTEREST_SCHEMA)
                     .set("region", "us").set("user_id", 0).set("interest", "food").build());
  interestData.add(StructuredRecord.builder(INTEREST_SCHEMA)
                     .set("region", "us").set("user_id", 0).set("interest", "sports").build());
  interestData.add(StructuredRecord.builder(INTEREST_SCHEMA)
                     .set("region", "us").set("user_id", 1).set("interest", "gardening").build());
  interestData.add(StructuredRecord.builder(INTEREST_SCHEMA)
                     .set("region", "us").set("user_id", 2).set("interest", "gaming").build());
  inputManager = getDataset(interestInput);
  MockSource.writeInput(inputManager, interestData);

  List<StructuredRecord> ageData = new ArrayList<>();
  ageData.add(StructuredRecord.builder(AGE_SCHEMA)
                .set("region", "us").set("user_id", 10).set("age", 20).build());
  ageData.add(StructuredRecord.builder(AGE_SCHEMA)
                .set("region", "us").set("user_id", 1).set("age", 30).build());
  inputManager = getDataset(ageInput);
  MockSource.writeInput(inputManager, ageData);

  WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
  workflowManager.startAndWaitForGoodRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);

  DataSetManager<Table> outputManager = getDataset(output);
  List<StructuredRecord> outputRecords = MockSink.readOutput(outputManager);

  Schema expectedSchema = expected.isEmpty() ? null : expected.iterator().next().getSchema();
  Set<StructuredRecord> actual;
  // Compare schemas by value, not by reference, and never index into an empty output list: an empty
  // output must surface as an assertion failure below rather than as an IndexOutOfBoundsException.
  if (expectedSchema == null || outputRecords.isEmpty()
    || expectedSchema.equals(outputRecords.get(0).getSchema())) {
    actual = new HashSet<>(outputRecords);
  } else {
    // reorder the output columns of the join result (actual) to match the column order of expected
    actual = new HashSet<>();
    for (StructuredRecord sr : outputRecords) {
      StructuredRecord.Builder reordered = StructuredRecord.builder(expectedSchema);
      // Copy exactly the columns the expected schema declares, in that schema's order.
      for (Schema.Field field : expectedSchema.getFields()) {
        reordered.set(field.getName(), sr.get(field.getName()));
      }
      actual.add(reordered.build());
    }
  }
  Assert.assertEquals(expected, actual);

  // Every record written to the four sources is counted by the join.records.in metric (11 with the data above).
  int recordsIn = userData.size() + purchaseData.size() + interestData.size() + ageData.size();
  validateMetric(recordsIn, appId, "join.records.in");
  validateMetric(expected.size(), appId, "join.records.out");
}
Use of io.cdap.cdap.api.data.format.StructuredRecord in project cdap by caskdata.
From the class AutoJoinerTest, method testCaseSensitivity.
/**
 * Joins two inputs whose schemas contain field names differing only by letter case ({@code id},
 * {@code ID}, {@code Id}) on the keys {@code id} and {@code ID}, and expects exactly one output row:
 * the pair of input rows that agree on both {@code id} and {@code ID}.
 */
@Test
public void testCaseSensitivity() throws Exception {
  Schema intType = Schema.of(Schema.Type.INT);
  Schema longType = Schema.of(Schema.Type.LONG);
  Schema stringType = Schema.of(Schema.Type.STRING);

  Schema leftSchema = Schema.recordOf(
    "weird1",
    Schema.Field.of("id", intType),
    Schema.Field.of("ID", longType),
    Schema.Field.of("Id", intType),
    Schema.Field.of("name", stringType));
  Schema rightSchema = Schema.recordOf(
    "weird2",
    Schema.Field.of("id", intType),
    Schema.Field.of("ID", longType),
    Schema.Field.of("val", stringType));

  String leftTable = UUID.randomUUID().toString();
  String rightTable = UUID.randomUUID().toString();
  String sinkTable = UUID.randomUUID().toString();

  // NOTE(review): the meaning of the trailing `true` flag is defined by MockAutoJoiner.getPlugin,
  // which is not visible here - confirm against that helper.
  ETLBatchConfig pipeline = ETLBatchConfig.builder()
    .addStage(new ETLStage("i1", MockSource.getPlugin(leftTable, leftSchema)))
    .addStage(new ETLStage("i2", MockSource.getPlugin(rightTable, rightSchema)))
    .addStage(new ETLStage("join", MockAutoJoiner.getPlugin(Arrays.asList("i1", "i2"),
                                                            Arrays.asList("id", "ID"),
                                                            Arrays.asList("i1", "i2"),
                                                            Collections.emptyList(),
                                                            Collections.emptyList(), true)))
    .addStage(new ETLStage("sink", MockSink.getPlugin(sinkTable)))
    .addConnection("i1", "join")
    .addConnection("i2", "join")
    .addConnection("join", "sink")
    .setEngine(Engine.SPARK)
    .build();

  ApplicationId appId = NamespaceId.DEFAULT.app(UUID.randomUUID().toString());
  ApplicationManager appManager = deployApplication(appId, new AppRequest<>(APP_ARTIFACT, pipeline));

  // write input data
  List<StructuredRecord> leftRows = Arrays.asList(
    StructuredRecord.builder(leftSchema).set("id", 0).set("ID", 99L).set("Id", 0).set("name", "zero").build(),
    StructuredRecord.builder(leftSchema).set("id", 1).set("ID", 0L).set("Id", 0).set("name", "one").build());
  DataSetManager<Table> leftManager = getDataset(leftTable);
  MockSource.writeInput(leftManager, leftRows);

  List<StructuredRecord> rightRows = Arrays.asList(
    StructuredRecord.builder(rightSchema).set("id", 0).set("ID", 99L).set("val", "0").build(),
    StructuredRecord.builder(rightSchema).set("id", 1).set("ID", 99L).set("val", "1").build(),
    StructuredRecord.builder(rightSchema).set("id", 0).set("ID", 0L).set("val", "2").build());
  DataSetManager<Table> rightManager = getDataset(rightTable);
  MockSource.writeInput(rightManager, rightRows);

  // Run with the partitions argument set to "1".
  Map<String, String> runtimeArgs = Collections.singletonMap(MockAutoJoiner.PARTITIONS_ARGUMENT, "1");
  WorkflowManager workflow = appManager.getWorkflowManager(SmartWorkflow.NAME);
  workflow.startAndWaitForGoodRun(runtimeArgs, ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);

  // Only the (id = 0, ID = 99) rows agree on both keys.
  Schema joinedSchema = Schema.recordOf(
    "i1.i2",
    Schema.Field.of("i1_id", intType),
    Schema.Field.of("i1_ID", longType),
    Schema.Field.of("i1_Id", intType),
    Schema.Field.of("i1_name", stringType),
    Schema.Field.of("i2_id", intType),
    Schema.Field.of("i2_ID", longType),
    Schema.Field.of("i2_val", stringType));
  StructuredRecord onlyMatch = StructuredRecord.builder(joinedSchema)
    .set("i1_id", 0)
    .set("i1_ID", 99L)
    .set("i1_Id", 0)
    .set("i1_name", "zero")
    .set("i2_id", 0)
    .set("i2_ID", 99L)
    .set("i2_val", "0")
    .build();

  DataSetManager<Table> sinkManager = getDataset(sinkTable);
  List<StructuredRecord> sinkRows = MockSink.readOutput(sinkManager);
  Assert.assertEquals(Collections.singletonList(onlyMatch), sinkRows);
}
Aggregations