Use of io.cdap.cdap.etl.api.join.JoinField in project cdap by cdapio.
From class AutoJoinerTest, the method testInnerBetweenCondition.
@Test
public void testInnerBetweenCondition() throws Exception {
/*
    users ----------|
                    |--> join --> sink
    age_groups -----|

    joinOn: users.age >= age_groups.lo and (users.age < age_groups.hi or age_groups.hi is null)
 */
Schema userSchema = Schema.recordOf("user",
    Schema.Field.of("name", Schema.of(Schema.Type.STRING)),
    Schema.Field.of("age", Schema.nullableOf(Schema.of(Schema.Type.INT))));
Schema ageGroupSchema = Schema.recordOf("age_group",
    Schema.Field.of("name", Schema.of(Schema.Type.STRING)),
    Schema.Field.of("lo", Schema.of(Schema.Type.INT)),
    Schema.Field.of("hi", Schema.nullableOf(Schema.of(Schema.Type.INT))));
Schema expectedSchema = Schema.recordOf("users.age_groups",
    Schema.Field.of("username", Schema.of(Schema.Type.STRING)),
    Schema.Field.of("age_group", Schema.of(Schema.Type.STRING)));
String userInput = UUID.randomUUID().toString();
String agesInput = UUID.randomUUID().toString();
String output = UUID.randomUUID().toString();
List<JoinField> select = new ArrayList<>();
select.add(new JoinField("users", "name", "username"));
select.add(new JoinField("age_groups", "name", "age_group"));
JoinCondition.OnExpression condition = JoinCondition.onExpression()
    .setExpression("users.age >= age_groups.lo and (users.age < age_groups.hi or age_groups.hi is null)")
    .build();
Map<String, String> joinerProperties = MockAutoJoiner.getProperties(
    Arrays.asList("users", "age_groups"), Collections.emptyList(),
    Arrays.asList("users", "age_groups"), Collections.emptyList(),
    select, false, null, condition);
ETLBatchConfig config = ETLBatchConfig.builder()
    .addStage(new ETLStage("users", MockSource.getPlugin(userInput, userSchema)))
    .addStage(new ETLStage("age_groups", MockSource.getPlugin(agesInput, ageGroupSchema)))
    .addStage(new ETLStage("join", new ETLPlugin(MockAutoJoiner.NAME, BatchJoiner.PLUGIN_TYPE, joinerProperties)))
    .addStage(new ETLStage("sink", MockSink.getPlugin(output)))
    .addConnection("users", "join")
    .addConnection("age_groups", "join")
    .addConnection("join", "sink")
    .setEngine(Engine.SPARK)
    .build();
AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, config);
ApplicationId appId = NamespaceId.DEFAULT.app(UUID.randomUUID().toString());
ApplicationManager appManager = deployApplication(appId, appRequest);
List<StructuredRecord> records = new ArrayList<>();
records.add(StructuredRecord.builder(userSchema).set("name", "Alice").set("age", 35).build());
records.add(StructuredRecord.builder(userSchema).set("name", "Bob").build());
records.add(StructuredRecord.builder(userSchema).set("name", "Carl").set("age", 13).build());
records.add(StructuredRecord.builder(userSchema).set("name", "Dave").set("age", 0).build());
records.add(StructuredRecord.builder(userSchema).set("name", "Elaine").set("age", 68).build());
records.add(StructuredRecord.builder(userSchema).set("name", "Fred").set("age", 4).build());
DataSetManager<Table> inputManager = getDataset(userInput);
MockSource.writeInput(inputManager, records);
records.clear();
records.add(StructuredRecord.builder(ageGroupSchema).set("name", "infant").set("lo", 0).set("hi", 2).build());
records.add(StructuredRecord.builder(ageGroupSchema).set("name", "toddler").set("lo", 2).set("hi", 5).build());
records.add(StructuredRecord.builder(ageGroupSchema).set("name", "child").set("lo", 5).set("hi", 13).build());
records.add(StructuredRecord.builder(ageGroupSchema).set("name", "teen").set("lo", 13).set("hi", 20).build());
records.add(StructuredRecord.builder(ageGroupSchema).set("name", "adult").set("lo", 20).set("hi", 65).build());
records.add(StructuredRecord.builder(ageGroupSchema).set("name", "senior").set("lo", 65).build());
inputManager = getDataset(agesInput);
MockSource.writeInput(inputManager, records);
WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
workflowManager.startAndWaitForGoodRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
DataSetManager<Table> outputManager = getDataset(output);
List<StructuredRecord> outputRecords = MockSink.readOutput(outputManager);
Set<StructuredRecord> expected = new HashSet<>();
expected.add(StructuredRecord.builder(expectedSchema).set("username", "Alice").set("age_group", "adult").build());
expected.add(StructuredRecord.builder(expectedSchema).set("username", "Carl").set("age_group", "teen").build());
expected.add(StructuredRecord.builder(expectedSchema).set("username", "Dave").set("age_group", "infant").build());
expected.add(StructuredRecord.builder(expectedSchema).set("username", "Elaine").set("age_group", "senior").build());
expected.add(StructuredRecord.builder(expectedSchema).set("username", "Fred").set("age_group", "toddler").build());
Assert.assertEquals(expected, new HashSet<>(outputRecords));
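// 6 user records and 6 age-group records enter the join (12 records in), but only
// 5 rows come out: Bob has a null age, so the inner join condition drops him.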
validateMetric(6, appId, "users.records.out");
validateMetric(6, appId, "age_groups.records.out");
validateMetric(12, appId, "join.records.in");
validateMetric(expected.size(), appId, "join.records.out");
}
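As a side note, the same between-style condition can also be written against short dataset aliases, as the next example does with addDatasetAlias. A minimal sketch, assuming the same "users" and "age_groups" stage names:

JoinCondition.OnExpression aliased = JoinCondition.onExpression()
    .addDatasetAlias("users", "U")
    .addDatasetAlias("age_groups", "G")
    .setExpression("U.age >= G.lo and (U.age < G.hi or G.hi is null)")
    .build();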
Use of io.cdap.cdap.etl.api.join.JoinField in project cdap by cdapio.
From class AutoJoinerTest, the method testLeftOuterComplexConditionBroadcast.
@Test
public void testLeftOuterComplexConditionBroadcast() throws Exception {
/*
    sales ----------|
                    |--> join --> sink
    categories -----|

    joinOn:
      sales.price > 1000 and sales.date > '2020-01-01 00:00:00' and
      (sales.category = categories.id or (sales.category is null and sales.department = categories.department))
 */
Schema salesSchema = Schema.recordOf("sale",
    Schema.Field.of("id", Schema.of(Schema.Type.INT)),
    Schema.Field.of("price", Schema.of(Schema.Type.DOUBLE)),
    Schema.Field.of("date", Schema.of(Schema.LogicalType.DATETIME)),
    Schema.Field.of("category", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
    Schema.Field.of("department", Schema.nullableOf(Schema.of(Schema.Type.STRING))));
Schema categorySchema = Schema.recordOf("category",
    Schema.Field.of("id", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
    Schema.Field.of("department", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
    Schema.Field.of("flag", Schema.nullableOf(Schema.of(Schema.Type.BOOLEAN))));
Schema expectedSchema = Schema.recordOf("sales.categories",
    Schema.Field.of("id", Schema.of(Schema.Type.INT)),
    Schema.Field.of("flag", Schema.nullableOf(Schema.of(Schema.Type.BOOLEAN))));
String salesInput = UUID.randomUUID().toString();
String categoriesInput = UUID.randomUUID().toString();
String output = UUID.randomUUID().toString();
List<JoinField> select = new ArrayList<>();
select.add(new JoinField("sales", "id"));
select.add(new JoinField("categories", "flag"));
JoinCondition.OnExpression condition = JoinCondition.onExpression()
    .addDatasetAlias("sales", "S")
    .addDatasetAlias("categories", "C")
    .setExpression("S.price > 1000 and S.date > '2020-01-01 00:00:00' and " +
                     "(S.category = C.id or (S.category is null and S.department = C.department))")
    .build();
Map<String, String> joinerProperties = MockAutoJoiner.getProperties(
    Arrays.asList("sales", "categories"), Collections.emptyList(),
    Collections.singletonList("sales"), Collections.singletonList("categories"),
    select, false, null, condition);
ETLBatchConfig config = ETLBatchConfig.builder()
    .addStage(new ETLStage("sales", MockSource.getPlugin(salesInput, salesSchema)))
    .addStage(new ETLStage("categories", MockSource.getPlugin(categoriesInput, categorySchema)))
    .addStage(new ETLStage("join", new ETLPlugin(MockAutoJoiner.NAME, BatchJoiner.PLUGIN_TYPE, joinerProperties)))
    .addStage(new ETLStage("sink", MockSink.getPlugin(output)))
    .addConnection("sales", "join")
    .addConnection("categories", "join")
    .addConnection("join", "sink")
    .setEngine(Engine.SPARK)
    .build();
AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, config);
ApplicationId appId = NamespaceId.DEFAULT.app(UUID.randomUUID().toString());
ApplicationManager appManager = deployApplication(appId, appRequest);
List<StructuredRecord> records = new ArrayList<>();
records.add(StructuredRecord.builder(salesSchema).set("id", 0).set("price", 123.45d).set("date", "2021-01-01 00:00:00").set("category", "electronics").set("department", "entertainment").build());
records.add(StructuredRecord.builder(salesSchema).set("id", 1).set("price", 1000.01d).set("date", "2020-01-01 00:00:01").set("department", "home").build());
records.add(StructuredRecord.builder(salesSchema).set("id", 2).set("price", 5000d).set("date", "2021-01-01 00:00:00").set("category", "furniture").build());
records.add(StructuredRecord.builder(salesSchema).set("id", 3).set("price", 2000d).set("date", "2019-12-31 23:59:59").set("category", "furniture").build());
records.add(StructuredRecord.builder(salesSchema).set("id", 4).set("price", 2000d).set("date", "2020-01-01 12:00:00").set("category", "tv").set("department", "entertainment").build());
DataSetManager<Table> inputManager = getDataset(salesInput);
MockSource.writeInput(inputManager, records);
records.clear();
records.add(StructuredRecord.builder(categorySchema).set("id", "electronics").set("department", "entertainment").set("flag", false).build());
records.add(StructuredRecord.builder(categorySchema).set("id", "furniture").set("department", "home").set("flag", true).build());
records.add(StructuredRecord.builder(categorySchema).set("id", "tv").set("department", "entertainment").set("flag", false).build());
inputManager = getDataset(categoriesInput);
MockSource.writeInput(inputManager, records);
WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
workflowManager.startAndWaitForGoodRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
DataSetManager<Table> outputManager = getDataset(output);
List<StructuredRecord> outputRecords = MockSink.readOutput(outputManager);
Set<StructuredRecord> expected = new HashSet<>();
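// Sales 0 and 3 fail the price/date predicates, so the left outer join keeps them
// with a null flag; sale 1 (null category) matches "furniture" through its department,
// while sales 2 and 4 match through category equality.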
expected.add(StructuredRecord.builder(expectedSchema).set("id", 0).build());
expected.add(StructuredRecord.builder(expectedSchema).set("id", 1).set("flag", true).build());
expected.add(StructuredRecord.builder(expectedSchema).set("id", 2).set("flag", true).build());
expected.add(StructuredRecord.builder(expectedSchema).set("id", 3).build());
expected.add(StructuredRecord.builder(expectedSchema).set("id", 4).set("flag", false).build());
Assert.assertEquals(expected, new HashSet<>(outputRecords));
}
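The MockAutoJoiner.getProperties call above packs the join configuration into plugin properties. As a sketch, here is the same call annotated with the argument roles as inferred from the two tests on this page; the annotations are assumptions, not a documented API contract:

Map<String, String> props = MockAutoJoiner.getProperties(
    Arrays.asList("sales", "categories"),    // stages participating in the join (inferred role)
    Collections.emptyList(),                 // no equality join keys; an expression condition is used instead (inferred role)
    Collections.singletonList("sales"),      // required stages: "sales" drives the left outer join (inferred role)
    Collections.singletonList("categories"), // broadcast stages: the small dataset is broadcast (inferred role)
    select, false, null, condition);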
Use of io.cdap.cdap.etl.api.join.JoinField in project hydrator-plugins by cdapio.
From class Joiner, the method createFieldOperations.
/**
 * Create the field operations from the provided {@link JoinField} instances and join keys. For a join we record
 * several types of transformation: Join, Identity, and Rename. For each of these transformations, if the input
 * field comes directly from the schema of one of the stages, the field is added as {@code stage_name.field_name}.
 * We keep track of the fields output by operations (in the {@code outputsSoFar} set) so that when a later
 * operation uses such a field as input, we add it without the stage name.
 * <p>
 * The Join transform operation is added with the join keys, tagged with their stage names, as inputs and the
 * join keys without stage names as outputs.
 * <p>
 * For fields that keep their names in the join an Identity transform is added, while a Rename transform is
 * added for fields that are renamed.
 *
 * @param outputFields collection of output fields along with information such as stage name and alias
 * @param joinKeys join keys
 * @return list of field operations
 */
@VisibleForTesting
static List<FieldOperation> createFieldOperations(List<JoinField> outputFields, Set<JoinKey> joinKeys) {
LinkedList<FieldOperation> operations = new LinkedList<>();
Map<String, List<String>> perStageJoinKeys = joinKeys.stream().collect(Collectors.toMap(JoinKey::getStageName, JoinKey::getFields));
// Add JOIN operation
List<String> joinInputs = new ArrayList<>();
Set<String> joinOutputs = new LinkedHashSet<>();
for (Map.Entry<String, List<String>> joinKey : perStageJoinKeys.entrySet()) {
for (String field : joinKey.getValue()) {
joinInputs.add(joinKey.getKey() + "." + field);
joinOutputs.add(field);
}
}
FieldOperation joinOperation = new FieldTransformOperation("Join", JOIN_OPERATION_DESCRIPTION, joinInputs, new ArrayList<>(joinOutputs));
operations.add(joinOperation);
Set<String> outputsSoFar = new HashSet<>(joinOutputs);
for (JoinField outputField : outputFields) {
// the operation's input field comes from the stage schema if it has not been output by an earlier operation
String stagedInputField = outputsSoFar.contains(outputField.getFieldName())
    ? outputField.getFieldName()
    : outputField.getStageName() + "." + outputField.getFieldName();
String outputFieldName = outputField.getAlias() == null ? outputField.getFieldName() : outputField.getAlias();
if (outputField.getFieldName().equals(outputFieldName)) {
// field is not renamed: record an Identity transform unless it is a join key
List<String> stageJoinKeys = perStageJoinKeys.get(outputField.getStageName());
if (stageJoinKeys == null || stageJoinKeys.contains(outputField.getFieldName())) {
// join keys are already accounted for by the Join operation
continue;
}
String operationName = String.format("Identity %s", stagedInputField);
FieldOperation identity = new FieldTransformOperation(operationName, IDENTITY_OPERATION_DESCRIPTION, Collections.singletonList(stagedInputField), outputFieldName);
operations.add(identity);
continue;
}
String operationName = String.format("Rename %s", stagedInputField);
FieldOperation transform = new FieldTransformOperation(operationName, RENAME_OPERATION_DESCRIPTION, Collections.singletonList(stagedInputField), outputFieldName);
operations.add(transform);
}
return operations;
}
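A minimal sketch of invoking the helper directly; the stage and field names here are hypothetical, and the commented result follows from the algorithm described above (the tests below exercise fuller scenarios):

Set<JoinKey> keys = Collections.singleton(new JoinKey("customer", Collections.singletonList("id")));
List<JoinField> out = Collections.singletonList(new JoinField("customer", "name", "customer_name"));
List<FieldOperation> ops = Joiner.createFieldOperations(out, keys);
// ops now holds two operations:
//   Join:                 inputs [customer.id],   outputs [id]
//   Rename customer.name: inputs [customer.name], outputs [customer_name]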
Use of io.cdap.cdap.etl.api.join.JoinField in project hydrator-plugins by cdapio.
From class JoinFieldLineageTest, the method testSimpleJoinWithRenameJoinKeys.
@Test
public void testSimpleJoinWithRenameJoinKeys() {
// customer -> (id, name) ----------
//                                  |
//                                  JOIN -------> (id_from_customer, id_from_purchase)
//                                  |
// purchase -> (customer_id, item) -
List<JoinField> outputFieldInfos = new ArrayList<>();
outputFieldInfos.add(new JoinField("customer", "id", "id_from_customer"));
outputFieldInfos.add(new JoinField("purchase", "customer_id", "id_from_purchase"));
Set<JoinKey> joinKeys = new HashSet<>();
joinKeys.add(new JoinKey("customer", Collections.singletonList("id")));
joinKeys.add(new JoinKey("purchase", Collections.singletonList("customer_id")));
List<FieldOperation> fieldOperations = Joiner.createFieldOperations(outputFieldInfos, joinKeys);
List<FieldTransformOperation> expected = new ArrayList<>();
expected.add(new FieldTransformOperation("Join", Joiner.JOIN_OPERATION_DESCRIPTION, Arrays.asList("customer.id", "purchase.customer_id"), Arrays.asList("id", "customer_id")));
expected.add(new FieldTransformOperation("Rename id", Joiner.RENAME_OPERATION_DESCRIPTION, Collections.singletonList("id"), Collections.singletonList("id_from_customer")));
expected.add(new FieldTransformOperation("Rename customer_id", Joiner.RENAME_OPERATION_DESCRIPTION, Collections.singletonList("customer_id"), Collections.singletonList("id_from_purchase")));
compareOperations(expected, fieldOperations);
}
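Note that the Rename inputs above are the bare names id and customer_id rather than customer.id and purchase.customer_id: both fields are join keys, so the Join operation has already emitted them without stage prefixes and they are resolved through the outputsSoFar set.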
Use of io.cdap.cdap.etl.api.join.JoinField in project hydrator-plugins by cdapio.
From class JoinFieldLineageTest, the method testSimpleJoinWithRenameOnAdditionalFields.
@Test
public void testSimpleJoinWithRenameOnAdditionalFields() {
// customer -> (id, name) ----------
//                                  |
//                                  JOIN ---> (id_from_customer, customer_id, name_from_customer, item_from_purchase)
//                                  |
// purchase -> (customer_id, item) -
List<JoinField> outputFieldInfos = new ArrayList<>();
outputFieldInfos.add(new JoinField("customer", "id", "id_from_customer"));
outputFieldInfos.add(new JoinField("customer", "name", "name_from_customer"));
outputFieldInfos.add(new JoinField("purchase", "customer_id", "customer_id"));
outputFieldInfos.add(new JoinField("purchase", "item", "item_from_purchase"));
Set<JoinKey> joinKeys = new HashSet<>();
joinKeys.add(new JoinKey("customer", Collections.singletonList("id")));
joinKeys.add(new JoinKey("purchase", Collections.singletonList("customer_id")));
List<FieldOperation> fieldOperations = Joiner.createFieldOperations(outputFieldInfos, joinKeys);
List<FieldTransformOperation> expected = new ArrayList<>();
expected.add(new FieldTransformOperation("Join", Joiner.JOIN_OPERATION_DESCRIPTION, Arrays.asList("customer.id", "purchase.customer_id"), Arrays.asList("id", "customer_id")));
expected.add(new FieldTransformOperation("Rename id", Joiner.RENAME_OPERATION_DESCRIPTION, Collections.singletonList("id"), Collections.singletonList("id_from_customer")));
expected.add(new FieldTransformOperation("Rename customer.name", Joiner.RENAME_OPERATION_DESCRIPTION, Collections.singletonList("customer.name"), Collections.singletonList("name_from_customer")));
expected.add(new FieldTransformOperation("Rename purchase.item", Joiner.RENAME_OPERATION_DESCRIPTION, Collections.singletonList("purchase.item"), Collections.singletonList("item_from_purchase")));
compareOperations(expected, fieldOperations);
}