Use of io.cdap.cdap.api.data.format.StructuredRecord in project cdap by caskdata.
Class AutoJoinerTest, method testSimpleAutoJoin.
/**
 * Deploys and runs a pipeline that joins a "users" source and a "purchases" source through a
 * {@code MockAutoJoiner} stage into a sink, then checks the sink contents and the stage metrics.
 *
 * @param required  passed through to {@code MockAutoJoiner.getPlugin} as its third argument
 * @param broadcast passed through to {@code MockAutoJoiner.getPlugin} as its fourth argument
 * @param expected  records the sink is expected to contain after the run (compared as a set)
 * @param engine    execution engine configured on the pipeline
 * @throws Exception if deployment, the run, or any verification step fails
 */
private void testSimpleAutoJoin(List<String> required, List<String> broadcast,
                                Set<StructuredRecord> expected, Engine engine) throws Exception {
  /*
   * users ------|
   *             |--> join --> sink
   * purchases --|
   *
   * joinOn: users.region = purchases.region and users.user_id = purchases.user_id
   */
  String usersDataset = UUID.randomUUID().toString();
  String purchasesDataset = UUID.randomUUID().toString();
  String sinkDataset = UUID.randomUUID().toString();

  ETLStage usersStage = new ETLStage("users", MockSource.getPlugin(usersDataset, USER_SCHEMA));
  ETLStage purchasesStage = new ETLStage("purchases", MockSource.getPlugin(purchasesDataset, PURCHASE_SCHEMA));
  ETLStage joinStage = new ETLStage(
    "join",
    MockAutoJoiner.getPlugin(Arrays.asList("purchases", "users"),
                             Arrays.asList("region", "user_id"),
                             required, broadcast, Collections.emptyList(), true));
  ETLStage sinkStage = new ETLStage("sink", MockSink.getPlugin(sinkDataset));

  ETLBatchConfig pipelineConfig = ETLBatchConfig.builder()
    .addStage(usersStage)
    .addStage(purchasesStage)
    .addStage(joinStage)
    .addStage(sinkStage)
    .addConnection("users", "join")
    .addConnection("purchases", "join")
    .addConnection("join", "sink")
    .setEngine(engine)
    .build();

  ApplicationId appId = NamespaceId.DEFAULT.app(UUID.randomUUID().toString());
  ApplicationManager appManager = deployApplication(appId, new AppRequest<>(APP_ARTIFACT, pipelineConfig));

  // Populate the users source.
  List<StructuredRecord> users = Arrays.asList(USER_ALICE, USER_ALYCE, USER_BOB);
  DataSetManager<Table> usersManager = getDataset(usersDataset);
  MockSource.writeInput(usersManager, users);

  // Populate the purchases source.
  List<StructuredRecord> purchases = new ArrayList<>();
  purchases.add(StructuredRecord.builder(PURCHASE_SCHEMA)
                  .set("region", "us")
                  .set("user_id", 0)
                  .set("purchase_id", 123)
                  .build());
  purchases.add(StructuredRecord.builder(PURCHASE_SCHEMA)
                  .set("region", "us")
                  .set("user_id", 2)
                  .set("purchase_id", 456)
                  .build());
  DataSetManager<Table> purchasesManager = getDataset(purchasesDataset);
  MockSource.writeInput(purchasesManager, purchases);

  // Run the workflow, passing a partitions argument of "1" to the joiner.
  Map<String, String> runtimeArgs = Collections.singletonMap(MockAutoJoiner.PARTITIONS_ARGUMENT, "1");
  appManager.getWorkflowManager(SmartWorkflow.NAME)
    .startAndWaitForGoodRun(runtimeArgs, ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);

  // Compare as sets: the order in which the sink received records is not asserted.
  DataSetManager<Table> sinkManager = getDataset(sinkDataset);
  Set<StructuredRecord> actual = new HashSet<>(MockSink.readOutput(sinkManager));
  Assert.assertEquals(expected, actual);

  // The join stage sees every input record: 3 users + 2 purchases = 5.
  validateMetric(users.size() + purchases.size(), appId, "join.records.in");
  validateMetric(expected.size(), appId, "join.records.out");

  if (engine == Engine.SPARK) {
    // In SPARK number of partitions hint is ignored, so additional sinks are created
    return;
  }
  validateMetric(1, appId, "sink." + MockSink.INITIALIZED_COUNT_METRIC);
}
Use of io.cdap.cdap.api.data.format.StructuredRecord in project cdap by caskdata.
Class AutoJoinerTest, method testTripleAutoSingleRequiredJoin.
/**
 * Runs the three-way join of purchases, users and interests with only "purchases" passed as the
 * required list, and checks that the same output is produced for several stage orderings.
 *
 * Every user and interest field in the expected schema is nullable; the purchase with user_id 2
 * is expected with its user fields unset.
 */
@Test
public void testTripleAutoSingleRequiredJoin() throws Exception {
  Schema expectedSchema = Schema.recordOf(
    "purchases.users.interests",
    Schema.Field.of("purchases_region", Schema.of(Schema.Type.STRING)),
    Schema.Field.of("purchases_purchase_id", Schema.of(Schema.Type.INT)),
    Schema.Field.of("purchases_user_id", Schema.of(Schema.Type.INT)),
    Schema.Field.of("users_region", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
    Schema.Field.of("users_user_id", Schema.nullableOf(Schema.of(Schema.Type.INT))),
    Schema.Field.of("users_name", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
    Schema.Field.of("interests_region", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
    Schema.Field.of("interests_user_id", Schema.nullableOf(Schema.of(Schema.Type.INT))),
    Schema.Field.of("interests_interest", Schema.nullableOf(Schema.of(Schema.Type.STRING))));

  Set<StructuredRecord> expected = new HashSet<>();
  expected.add(StructuredRecord.builder(expectedSchema)
                 .set("purchases_region", "us")
                 .set("purchases_purchase_id", 123)
                 .set("purchases_user_id", 0)
                 .set("users_region", "us")
                 .set("users_user_id", 0)
                 .set("users_name", "alice")
                 .set("interests_region", "us")
                 .set("interests_user_id", 0)
                 .set("interests_interest", "food")
                 .build());
  expected.add(StructuredRecord.builder(expectedSchema)
                 .set("purchases_region", "us")
                 .set("purchases_purchase_id", 123)
                 .set("purchases_user_id", 0)
                 .set("users_region", "us")
                 .set("users_user_id", 0)
                 .set("users_name", "alice")
                 .set("interests_region", "us")
                 .set("interests_user_id", 0)
                 .set("interests_interest", "sports")
                 .build());
  // No user fields are set for this purchase; only the interest side is populated.
  expected.add(StructuredRecord.builder(expectedSchema)
                 .set("purchases_region", "us")
                 .set("purchases_purchase_id", 456)
                 .set("purchases_user_id", 2)
                 .set("interests_region", "us")
                 .set("interests_user_id", 2)
                 .set("interests_interest", "gaming")
                 .build());

  // First required : all left joins
  // Both engines must use the purchases-first ordering; an empty ordering would not exercise this case.
  testTripleAutoJoin(Collections.singletonList("purchases"), expected, Engine.SPARK,
                     Arrays.asList("purchases", "users", "interests"));
  testTripleAutoJoin(Collections.singletonList("purchases"), expected, Engine.MAPREDUCE,
                     Arrays.asList("purchases", "users", "interests"));

  // Middle Required : right join + left join.
  testTripleAutoJoin(Collections.singletonList("purchases"), expected, Engine.SPARK,
                     Arrays.asList("users", "purchases", "interests"));
  testTripleAutoJoin(Collections.singletonList("purchases"), expected, Engine.MAPREDUCE,
                     Arrays.asList("users", "purchases", "interests"));

  // Last Required : outer + left join with coalesce from first 2
  testTripleAutoJoin(Collections.singletonList("purchases"), expected, Engine.SPARK,
                     Arrays.asList("users", "interests", "purchases"));
}
Use of io.cdap.cdap.api.data.format.StructuredRecord in project cdap by caskdata.
Class AutoJoinerTest, method testAutoInnerJoinUsingSQLEngine.
/**
 * Runs the users/purchases join through both SQL-engine helpers on Spark and expects a single
 * joined record (purchase 123 matched with user "alice").
 *
 * All fields of the expected schema are non-nullable.
 */
@Test
public void testAutoInnerJoinUsingSQLEngine() throws Exception {
  Schema joinedSchema = Schema.recordOf(
    "purchases.users",
    Schema.Field.of("purchases_region", Schema.of(Schema.Type.STRING)),
    Schema.Field.of("purchases_purchase_id", Schema.of(Schema.Type.INT)),
    Schema.Field.of("purchases_user_id", Schema.of(Schema.Type.INT)),
    Schema.Field.of("users_region", Schema.of(Schema.Type.STRING)),
    Schema.Field.of("users_user_id", Schema.of(Schema.Type.INT)),
    Schema.Field.of("users_name", Schema.of(Schema.Type.STRING)));

  StructuredRecord alicePurchase = StructuredRecord.builder(joinedSchema)
    .set("purchases_region", "us")
    .set("purchases_purchase_id", 123)
    .set("purchases_user_id", 0)
    .set("users_region", "us")
    .set("users_user_id", 0)
    .set("users_name", "alice")
    .build();
  Set<StructuredRecord> expectedRecords = new HashSet<>();
  expectedRecords.add(alicePurchase);

  // Same inputs for both helpers; presumably the first list names the required stages
  // and the second the broadcast stages - confirm against the helper signatures.
  testSimpleAutoJoinUsingSQLEngine(Arrays.asList("users", "purchases"), Collections.emptyList(),
                                   expectedRecords, joinedSchema, Engine.SPARK);
  testSimpleAutoJoinUsingSQLEngineWithCapabilities(Arrays.asList("users", "purchases"), Collections.emptyList(),
                                                   expectedRecords, joinedSchema, Engine.SPARK);
}
Use of io.cdap.cdap.api.data.format.StructuredRecord in project cdap by caskdata.
Class AutoJoinerTest, method testLeftOuterComplexConditionBroadcast.
/**
 * Joins "sales" with "categories" on Spark using an expression-based join condition and checks
 * that every sale is emitted, with the category flag set only where the condition matched.
 *
 * "sales" is passed as the only required stage and "categories" as the only broadcast stage.
 * The output selects sales.id and categories.flag.
 */
@Test
public void testLeftOuterComplexConditionBroadcast() throws Exception {
  /*
   * sales ----------|
   *                 |--> join --> sink
   * categories -----|
   *
   * joinOn:
   *   sales.price > 1000 and sales.date > 2020-01-01 and
   *   (sales.category <=> categories.id or (sales.category is null and sales.department = categories.department))
   */
  Schema salesSchema = Schema.recordOf(
    "sale",
    Schema.Field.of("id", Schema.of(Schema.Type.INT)),
    Schema.Field.of("price", Schema.of(Schema.Type.DOUBLE)),
    Schema.Field.of("date", Schema.of(Schema.LogicalType.DATETIME)),
    Schema.Field.of("category", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
    Schema.Field.of("department", Schema.nullableOf(Schema.of(Schema.Type.STRING))));
  Schema categorySchema = Schema.recordOf(
    "category",
    Schema.Field.of("id", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
    Schema.Field.of("department", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
    Schema.Field.of("flag", Schema.nullableOf(Schema.of(Schema.Type.BOOLEAN))));
  Schema expectedSchema = Schema.recordOf(
    "sales.categories",
    Schema.Field.of("id", Schema.of(Schema.Type.INT)),
    Schema.Field.of("flag", Schema.nullableOf(Schema.of(Schema.Type.BOOLEAN))));

  String salesInput = UUID.randomUUID().toString();
  String categoriesInput = UUID.randomUUID().toString();
  String output = UUID.randomUUID().toString();

  List<JoinField> select = new ArrayList<>();
  select.add(new JoinField("sales", "id"));
  select.add(new JoinField("categories", "flag"));

  // The category comparison uses the null-safe operator (<=>), matching the join condition
  // described above; the explicit "is null" branch handles sales without a category.
  JoinCondition.OnExpression condition = JoinCondition.onExpression()
    .addDatasetAlias("sales", "S")
    .addDatasetAlias("categories", "C")
    .setExpression("S.price > 1000 and S.date > '2020-01-01 00:00:00' and "
                     + "(S.category <=> C.id or (S.category is null and S.department = C.department))")
    .build();

  Map<String, String> joinerProperties = MockAutoJoiner.getProperties(
    Arrays.asList("sales", "categories"),
    Collections.emptyList(),
    Collections.singletonList("sales"),
    Collections.singletonList("categories"),
    select, false, null, condition);

  ETLBatchConfig config = ETLBatchConfig.builder()
    .addStage(new ETLStage("sales", MockSource.getPlugin(salesInput, salesSchema)))
    .addStage(new ETLStage("categories", MockSource.getPlugin(categoriesInput, categorySchema)))
    .addStage(new ETLStage("join", new ETLPlugin(MockAutoJoiner.NAME, BatchJoiner.PLUGIN_TYPE, joinerProperties)))
    .addStage(new ETLStage("sink", MockSink.getPlugin(output)))
    .addConnection("sales", "join")
    .addConnection("categories", "join")
    .addConnection("join", "sink")
    .setEngine(Engine.SPARK)
    .build();

  AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, config);
  ApplicationId appId = NamespaceId.DEFAULT.app(UUID.randomUUID().toString());
  ApplicationManager appManager = deployApplication(appId, appRequest);

  // Sales input. Each record probes a different part of the condition.
  List<StructuredRecord> sales = new ArrayList<>();
  // id 0: price is not above 1000
  sales.add(StructuredRecord.builder(salesSchema)
              .set("id", 0)
              .set("price", 123.45d)
              .set("date", "2021-01-01 00:00:00")
              .set("category", "electronics")
              .set("department", "entertainment")
              .build());
  // id 1: no category, department "home"
  sales.add(StructuredRecord.builder(salesSchema)
              .set("id", 1)
              .set("price", 1000.01d)
              .set("date", "2020-01-01 00:00:01")
              .set("department", "home")
              .build());
  // id 2: category "furniture", no department
  sales.add(StructuredRecord.builder(salesSchema)
              .set("id", 2)
              .set("price", 5000d)
              .set("date", "2021-01-01 00:00:00")
              .set("category", "furniture")
              .build());
  // id 3: date is not after 2020-01-01 00:00:00
  sales.add(StructuredRecord.builder(salesSchema)
              .set("id", 3)
              .set("price", 2000d)
              .set("date", "2019-12-31 23:59:59")
              .set("category", "furniture")
              .build());
  // id 4: category "tv"
  sales.add(StructuredRecord.builder(salesSchema)
              .set("id", 4)
              .set("price", 2000d)
              .set("date", "2020-01-01 12:00:00")
              .set("category", "tv")
              .set("department", "entertainment")
              .build());
  DataSetManager<Table> salesManager = getDataset(salesInput);
  MockSource.writeInput(salesManager, sales);

  // Categories input.
  List<StructuredRecord> categories = new ArrayList<>();
  categories.add(StructuredRecord.builder(categorySchema)
                   .set("id", "electronics")
                   .set("department", "entertainment")
                   .set("flag", false)
                   .build());
  categories.add(StructuredRecord.builder(categorySchema)
                   .set("id", "furniture")
                   .set("department", "home")
                   .set("flag", true)
                   .build());
  categories.add(StructuredRecord.builder(categorySchema)
                   .set("id", "tv")
                   .set("department", "entertainment")
                   .set("flag", false)
                   .build());
  DataSetManager<Table> categoriesManager = getDataset(categoriesInput);
  MockSource.writeInput(categoriesManager, categories);

  WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
  workflowManager.startAndWaitForGoodRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);

  DataSetManager<Table> outputManager = getDataset(output);
  List<StructuredRecord> outputRecords = MockSink.readOutput(outputManager);

  // Every sale appears in the output; the flag is set only for sales 1, 2 and 4.
  Set<StructuredRecord> expected = new HashSet<>();
  expected.add(StructuredRecord.builder(expectedSchema).set("id", 0).build());
  expected.add(StructuredRecord.builder(expectedSchema).set("id", 1).set("flag", true).build());
  expected.add(StructuredRecord.builder(expectedSchema).set("id", 2).set("flag", true).build());
  expected.add(StructuredRecord.builder(expectedSchema).set("id", 3).build());
  expected.add(StructuredRecord.builder(expectedSchema).set("id", 4).set("flag", false).build());
  Assert.assertEquals(expected, new HashSet<>(outputRecords));
}
Use of io.cdap.cdap.api.data.format.StructuredRecord in project cdap by caskdata.
Class AutoJoinerTest, method testNullEquality.
/**
 * Deploys and runs an items/attributes join whose inputs contain records with unset key fields,
 * then compares the sink output with the supplied expectation.
 *
 * @param engine      execution engine configured on the pipeline
 * @param nullIsEqual passed as the last argument of {@code MockAutoJoiner.getPlugin}
 * @param expected    records the sink is expected to contain after the run (compared as a set)
 * @throws Exception if deployment, the run, or the final comparison fails
 */
private void testNullEquality(Engine engine, boolean nullIsEqual,
                              Set<StructuredRecord> expected) throws Exception {
  Schema itemSchema = Schema.recordOf(
    "item",
    Schema.Field.of("id", Schema.nullableOf(Schema.of(Schema.Type.INT))),
    Schema.Field.of("region", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
    Schema.Field.of("name", Schema.nullableOf(Schema.of(Schema.Type.STRING))));
  Schema attributeSchema = Schema.recordOf(
    "attribute",
    Schema.Field.of("region", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
    Schema.Field.of("id", Schema.nullableOf(Schema.of(Schema.Type.INT))),
    Schema.Field.of("attr", Schema.nullableOf(Schema.of(Schema.Type.STRING))));

  /*
   * items -------|
   *              |--> join --> sink
   * attributes --|
   *
   * joinOn: items.region = attributes.region and items.id = attributes.id
   */
  String itemsDataset = UUID.randomUUID().toString();
  String attributesDataset = UUID.randomUUID().toString();
  String sinkDataset = UUID.randomUUID().toString();

  ETLStage itemsStage = new ETLStage("items", MockSource.getPlugin(itemsDataset, itemSchema));
  ETLStage attributesStage = new ETLStage("attributes", MockSource.getPlugin(attributesDataset, attributeSchema));
  ETLStage joinStage = new ETLStage(
    "join",
    MockAutoJoiner.getPlugin(Arrays.asList("items", "attributes"),
                             Arrays.asList("region", "id"),
                             Collections.singletonList("items"),
                             Collections.emptyList(),
                             Collections.emptyList(),
                             nullIsEqual));
  ETLStage sinkStage = new ETLStage("sink", MockSink.getPlugin(sinkDataset));

  ETLBatchConfig pipelineConfig = ETLBatchConfig.builder()
    .addStage(itemsStage)
    .addStage(attributesStage)
    .addStage(joinStage)
    .addStage(sinkStage)
    .addConnection("items", "join")
    .addConnection("attributes", "join")
    .addConnection("join", "sink")
    .setEngine(engine)
    .build();

  ApplicationId appId = NamespaceId.DEFAULT.app(UUID.randomUUID().toString());
  ApplicationManager appManager = deployApplication(appId, new AppRequest<>(APP_ARTIFACT, pipelineConfig));

  // Items: one fully keyed record, one with no region, one with no id.
  List<StructuredRecord> items = new ArrayList<>();
  items.add(StructuredRecord.builder(itemSchema)
              .set("region", "us")
              .set("id", 0)
              .set("name", "bacon")
              .build());
  items.add(StructuredRecord.builder(itemSchema)
              .set("id", 1)
              .build());
  items.add(StructuredRecord.builder(itemSchema)
              .set("region", "us")
              .build());
  DataSetManager<Table> itemsManager = getDataset(itemsDataset);
  MockSource.writeInput(itemsManager, items);

  // Attributes: the same three key shapes as the items above.
  List<StructuredRecord> attributes = new ArrayList<>();
  attributes.add(StructuredRecord.builder(attributeSchema)
                   .set("region", "us")
                   .set("id", 0)
                   .set("attr", "food")
                   .build());
  attributes.add(StructuredRecord.builder(attributeSchema)
                   .set("id", 1)
                   .set("attr", "car")
                   .build());
  attributes.add(StructuredRecord.builder(attributeSchema)
                   .set("region", "us")
                   .build());
  DataSetManager<Table> attributesManager = getDataset(attributesDataset);
  MockSource.writeInput(attributesManager, attributes);

  appManager.getWorkflowManager(SmartWorkflow.NAME)
    .startAndWaitForGoodRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);

  // Compare as sets: the order in which the sink received records is not asserted.
  DataSetManager<Table> sinkManager = getDataset(sinkDataset);
  Set<StructuredRecord> actual = new HashSet<>(MockSink.readOutput(sinkManager));
  Assert.assertEquals(expected, actual);
}
Aggregations