Search in sources :

Example 11 with JoinField

use of io.cdap.cdap.etl.api.join.JoinField in project cdap by cdapio.

the class AutoJoinerTest method testInnerBetweenCondition.

@Test
public void testInnerBetweenCondition() throws Exception {
    /*
         users ----------|
                         |--> join --> sink
         age_groups -----|

         joinOn: users.age >= age_groups.lo and (users.age < age_groups.hi or age_groups.hi is null)

         Both inputs are required (inner join). A user whose age is not set (Bob) satisfies no
         age group and is therefore absent from the expected output.
     */
    Schema userSchema = Schema.recordOf(
        "user",
        Schema.Field.of("name", Schema.of(Schema.Type.STRING)),
        Schema.Field.of("age", Schema.nullableOf(Schema.of(Schema.Type.INT))));
    Schema ageGroupSchema = Schema.recordOf(
        "age_group",
        Schema.Field.of("name", Schema.of(Schema.Type.STRING)),
        Schema.Field.of("lo", Schema.of(Schema.Type.INT)),
        Schema.Field.of("hi", Schema.nullableOf(Schema.of(Schema.Type.INT))));
    Schema expectedSchema = Schema.recordOf(
        "users.age_groups",
        Schema.Field.of("username", Schema.of(Schema.Type.STRING)),
        Schema.Field.of("age_group", Schema.of(Schema.Type.STRING)));

    // Randomly named datasets backing the two mock sources and the mock sink.
    String usersDataset = UUID.randomUUID().toString();
    String ageGroupsDataset = UUID.randomUUID().toString();
    String sinkDataset = UUID.randomUUID().toString();

    // Output carries only the two names, renamed to "username" and "age_group".
    List<JoinField> selectedFields = new ArrayList<>();
    selectedFields.add(new JoinField("users", "name", "username"));
    selectedFields.add(new JoinField("age_groups", "name", "age_group"));

    JoinCondition.OnExpression joinCondition = JoinCondition.onExpression()
        .setExpression("users.age >= age_groups.lo and (users.age < age_groups.hi or age_groups.hi is null)")
        .build();
    Map<String, String> joinerProperties = MockAutoJoiner.getProperties(
        Arrays.asList("users", "age_groups"),
        Collections.emptyList(),
        Arrays.asList("users", "age_groups"),
        Collections.emptyList(),
        selectedFields,
        false,
        null,
        joinCondition);

    ETLPlugin joinPlugin = new ETLPlugin(MockAutoJoiner.NAME, BatchJoiner.PLUGIN_TYPE, joinerProperties);
    ETLBatchConfig pipelineConfig = ETLBatchConfig.builder()
        .addStage(new ETLStage("users", MockSource.getPlugin(usersDataset, userSchema)))
        .addStage(new ETLStage("age_groups", MockSource.getPlugin(ageGroupsDataset, ageGroupSchema)))
        .addStage(new ETLStage("join", joinPlugin))
        .addStage(new ETLStage("sink", MockSink.getPlugin(sinkDataset)))
        .addConnection("users", "join")
        .addConnection("age_groups", "join")
        .addConnection("join", "sink")
        .setEngine(Engine.SPARK)
        .build();

    AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, pipelineConfig);
    ApplicationId appId = NamespaceId.DEFAULT.app(UUID.randomUUID().toString());
    ApplicationManager appManager = deployApplication(appId, appRequest);

    // Users input: six records, Bob has no age.
    List<StructuredRecord> userRecords = new ArrayList<>();
    userRecords.add(StructuredRecord.builder(userSchema).set("name", "Alice").set("age", 35).build());
    userRecords.add(StructuredRecord.builder(userSchema).set("name", "Bob").build());
    userRecords.add(StructuredRecord.builder(userSchema).set("name", "Carl").set("age", 13).build());
    userRecords.add(StructuredRecord.builder(userSchema).set("name", "Dave").set("age", 0).build());
    userRecords.add(StructuredRecord.builder(userSchema).set("name", "Elaine").set("age", 68).build());
    userRecords.add(StructuredRecord.builder(userSchema).set("name", "Fred").set("age", 4).build());
    DataSetManager<Table> usersManager = getDataset(usersDataset);
    MockSource.writeInput(usersManager, userRecords);

    // Age group input: six records, "senior" has no upper bound.
    List<StructuredRecord> ageGroupRecords = new ArrayList<>();
    ageGroupRecords.add(
        StructuredRecord.builder(ageGroupSchema).set("name", "infant").set("lo", 0).set("hi", 2).build());
    ageGroupRecords.add(
        StructuredRecord.builder(ageGroupSchema).set("name", "toddler").set("lo", 2).set("hi", 5).build());
    ageGroupRecords.add(
        StructuredRecord.builder(ageGroupSchema).set("name", "child").set("lo", 5).set("hi", 13).build());
    ageGroupRecords.add(
        StructuredRecord.builder(ageGroupSchema).set("name", "teen").set("lo", 13).set("hi", 20).build());
    ageGroupRecords.add(
        StructuredRecord.builder(ageGroupSchema).set("name", "adult").set("lo", 20).set("hi", 65).build());
    ageGroupRecords.add(
        StructuredRecord.builder(ageGroupSchema).set("name", "senior").set("lo", 65).build());
    DataSetManager<Table> ageGroupsManager = getDataset(ageGroupsDataset);
    MockSource.writeInput(ageGroupsManager, ageGroupRecords);

    // Run the pipeline once and wait for it to complete.
    WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
    workflowManager.startAndWaitForGoodRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);

    DataSetManager<Table> sinkManager = getDataset(sinkDataset);
    List<StructuredRecord> actualRecords = MockSink.readOutput(sinkManager);

    Set<StructuredRecord> expected = new HashSet<>();
    expected.add(
        StructuredRecord.builder(expectedSchema).set("username", "Alice").set("age_group", "adult").build());
    expected.add(
        StructuredRecord.builder(expectedSchema).set("username", "Carl").set("age_group", "teen").build());
    expected.add(
        StructuredRecord.builder(expectedSchema).set("username", "Dave").set("age_group", "infant").build());
    expected.add(
        StructuredRecord.builder(expectedSchema).set("username", "Elaine").set("age_group", "senior").build());
    expected.add(
        StructuredRecord.builder(expectedSchema).set("username", "Fred").set("age_group", "toddler").build());
    Assert.assertEquals(expected, new HashSet<>(actualRecords));

    // 6 records leave each source, so 12 enter the join; one record leaves it per expected row.
    validateMetric(6, appId, "users.records.out");
    validateMetric(6, appId, "age_groups.records.out");
    validateMetric(12, appId, "join.records.in");
    validateMetric(expected.size(), appId, "join.records.out");
}
Also used : ApplicationManager(io.cdap.cdap.test.ApplicationManager) Table(io.cdap.cdap.api.dataset.table.Table) Schema(io.cdap.cdap.api.data.schema.Schema) WorkflowManager(io.cdap.cdap.test.WorkflowManager) ArrayList(java.util.ArrayList) JoinField(io.cdap.cdap.etl.api.join.JoinField) ETLPlugin(io.cdap.cdap.etl.proto.v2.ETLPlugin) StructuredRecord(io.cdap.cdap.api.data.format.StructuredRecord) JoinCondition(io.cdap.cdap.etl.api.join.JoinCondition) AppRequest(io.cdap.cdap.proto.artifact.AppRequest) ETLBatchConfig(io.cdap.cdap.etl.proto.v2.ETLBatchConfig) ETLStage(io.cdap.cdap.etl.proto.v2.ETLStage) ApplicationId(io.cdap.cdap.proto.id.ApplicationId) HashSet(java.util.HashSet) Test(org.junit.Test)

Example 12 with JoinField

use of io.cdap.cdap.etl.api.join.JoinField in project cdap by cdapio.

the class AutoJoinerTest method testLeftOuterComplexConditionBroadcast.

@Test
public void testLeftOuterComplexConditionBroadcast() throws Exception {
    /*
         sales ----------|
                         |--> join --> sink
         categories -----|

         joinOn (with sales aliased as S and categories aliased as C):
           S.price > 1000 and S.date > '2020-01-01 00:00:00' and
           (S.category = C.id or (S.category is null and S.department = C.department))

         Only "sales" is passed in the required-stages list, and "categories" is the stage passed
         in the list that follows it. The expression compares category with plain '=', so a sale
         whose category is null can only match through the department clause.
     */
    Schema salesSchema = Schema.recordOf(
        "sale",
        Schema.Field.of("id", Schema.of(Schema.Type.INT)),
        Schema.Field.of("price", Schema.of(Schema.Type.DOUBLE)),
        Schema.Field.of("date", Schema.of(Schema.LogicalType.DATETIME)),
        Schema.Field.of("category", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
        Schema.Field.of("department", Schema.nullableOf(Schema.of(Schema.Type.STRING))));
    Schema categorySchema = Schema.recordOf(
        "category",
        Schema.Field.of("id", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
        Schema.Field.of("department", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
        Schema.Field.of("flag", Schema.nullableOf(Schema.of(Schema.Type.BOOLEAN))));
    Schema expectedSchema = Schema.recordOf(
        "sales.categories",
        Schema.Field.of("id", Schema.of(Schema.Type.INT)),
        Schema.Field.of("flag", Schema.nullableOf(Schema.of(Schema.Type.BOOLEAN))));

    // Randomly named datasets backing the two mock sources and the mock sink.
    String salesDataset = UUID.randomUUID().toString();
    String categoriesDataset = UUID.randomUUID().toString();
    String sinkDataset = UUID.randomUUID().toString();

    // Output keeps the sale id and the category flag under their original names.
    List<JoinField> selectedFields = new ArrayList<>();
    selectedFields.add(new JoinField("sales", "id"));
    selectedFields.add(new JoinField("categories", "flag"));

    JoinCondition.OnExpression joinCondition = JoinCondition.onExpression()
        .addDatasetAlias("sales", "S")
        .addDatasetAlias("categories", "C")
        .setExpression(
            "S.price > 1000 and S.date > '2020-01-01 00:00:00' and "
                + "(S.category = C.id or (S.category is null and S.department = C.department))")
        .build();
    Map<String, String> joinerProperties = MockAutoJoiner.getProperties(
        Arrays.asList("sales", "categories"),
        Collections.emptyList(),
        Collections.singletonList("sales"),
        Collections.singletonList("categories"),
        selectedFields,
        false,
        null,
        joinCondition);

    ETLPlugin joinPlugin = new ETLPlugin(MockAutoJoiner.NAME, BatchJoiner.PLUGIN_TYPE, joinerProperties);
    ETLBatchConfig pipelineConfig = ETLBatchConfig.builder()
        .addStage(new ETLStage("sales", MockSource.getPlugin(salesDataset, salesSchema)))
        .addStage(new ETLStage("categories", MockSource.getPlugin(categoriesDataset, categorySchema)))
        .addStage(new ETLStage("join", joinPlugin))
        .addStage(new ETLStage("sink", MockSink.getPlugin(sinkDataset)))
        .addConnection("sales", "join")
        .addConnection("categories", "join")
        .addConnection("join", "sink")
        .setEngine(Engine.SPARK)
        .build();

    AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, pipelineConfig);
    ApplicationId appId = NamespaceId.DEFAULT.app(UUID.randomUUID().toString());
    ApplicationManager appManager = deployApplication(appId, appRequest);

    // Sales input: five records exercising the price, date, category and department clauses.
    List<StructuredRecord> salesRecords = new ArrayList<>();
    salesRecords.add(StructuredRecord.builder(salesSchema)
        .set("id", 0)
        .set("price", 123.45d)
        .set("date", "2021-01-01 00:00:00")
        .set("category", "electronics")
        .set("department", "entertainment")
        .build());
    salesRecords.add(StructuredRecord.builder(salesSchema)
        .set("id", 1)
        .set("price", 1000.01d)
        .set("date", "2020-01-01 00:00:01")
        .set("department", "home")
        .build());
    salesRecords.add(StructuredRecord.builder(salesSchema)
        .set("id", 2)
        .set("price", 5000d)
        .set("date", "2021-01-01 00:00:00")
        .set("category", "furniture")
        .build());
    salesRecords.add(StructuredRecord.builder(salesSchema)
        .set("id", 3)
        .set("price", 2000d)
        .set("date", "2019-12-31 23:59:59")
        .set("category", "furniture")
        .build());
    salesRecords.add(StructuredRecord.builder(salesSchema)
        .set("id", 4)
        .set("price", 2000d)
        .set("date", "2020-01-01 12:00:00")
        .set("category", "tv")
        .set("department", "entertainment")
        .build());
    DataSetManager<Table> salesManager = getDataset(salesDataset);
    MockSource.writeInput(salesManager, salesRecords);

    // Categories input: three records.
    List<StructuredRecord> categoryRecords = new ArrayList<>();
    categoryRecords.add(StructuredRecord.builder(categorySchema)
        .set("id", "electronics")
        .set("department", "entertainment")
        .set("flag", false)
        .build());
    categoryRecords.add(StructuredRecord.builder(categorySchema)
        .set("id", "furniture")
        .set("department", "home")
        .set("flag", true)
        .build());
    categoryRecords.add(StructuredRecord.builder(categorySchema)
        .set("id", "tv")
        .set("department", "entertainment")
        .set("flag", false)
        .build());
    DataSetManager<Table> categoriesManager = getDataset(categoriesDataset);
    MockSource.writeInput(categoriesManager, categoryRecords);

    // Run the pipeline once and wait for it to complete.
    WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
    workflowManager.startAndWaitForGoodRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);

    DataSetManager<Table> sinkManager = getDataset(sinkDataset);
    List<StructuredRecord> actualRecords = MockSink.readOutput(sinkManager);

    // Every sale appears once; sales 0 and 3 are expected without a flag value.
    Set<StructuredRecord> expected = new HashSet<>();
    expected.add(StructuredRecord.builder(expectedSchema).set("id", 0).build());
    expected.add(StructuredRecord.builder(expectedSchema).set("id", 1).set("flag", true).build());
    expected.add(StructuredRecord.builder(expectedSchema).set("id", 2).set("flag", true).build());
    expected.add(StructuredRecord.builder(expectedSchema).set("id", 3).build());
    expected.add(StructuredRecord.builder(expectedSchema).set("id", 4).set("flag", false).build());
    Assert.assertEquals(expected, new HashSet<>(actualRecords));
}
Also used : ApplicationManager(io.cdap.cdap.test.ApplicationManager) Table(io.cdap.cdap.api.dataset.table.Table) Schema(io.cdap.cdap.api.data.schema.Schema) WorkflowManager(io.cdap.cdap.test.WorkflowManager) ArrayList(java.util.ArrayList) JoinField(io.cdap.cdap.etl.api.join.JoinField) ETLPlugin(io.cdap.cdap.etl.proto.v2.ETLPlugin) StructuredRecord(io.cdap.cdap.api.data.format.StructuredRecord) JoinCondition(io.cdap.cdap.etl.api.join.JoinCondition) AppRequest(io.cdap.cdap.proto.artifact.AppRequest) ETLBatchConfig(io.cdap.cdap.etl.proto.v2.ETLBatchConfig) ETLStage(io.cdap.cdap.etl.proto.v2.ETLStage) ApplicationId(io.cdap.cdap.proto.id.ApplicationId) HashSet(java.util.HashSet) Test(org.junit.Test)

Example 13 with JoinField

use of io.cdap.cdap.etl.api.join.JoinField in project hydrator-plugins by cdapio.

the class Joiner method createFieldOperations.

/**
 * Builds the field lineage operations for a join from the selected output fields and the join keys.
 * <p>
 * The first operation is always a single "Join" transform. Its inputs are the join key fields qualified as
 * {@code stage_name.field_name}; its outputs are the same key field names without the stage prefix.
 * <p>
 * Each output field then contributes at most one more operation. The input of that operation is the bare field
 * name when the name is one of the Join outputs, and {@code stage_name.field_name} otherwise:
 * <ul>
 *   <li>output name differs from the field name: a "Rename" transform is recorded;</li>
 *   <li>output name equals the field name and the field's stage has join keys that do not include it:
 *       an "Identity" transform is recorded;</li>
 *   <li>output name equals the field name and the field is a join key of its stage, or the stage has no
 *       join keys at all: nothing is recorded.</li>
 * </ul>
 *
 * @param outputFields output fields of the join, each carrying its stage name, field name and optional alias
 * @param joinKeys join keys, one entry per stage
 * @return the Join operation followed by the Rename/Identity operations, in {@code outputFields} order
 */
@VisibleForTesting
static List<FieldOperation> createFieldOperations(List<JoinField> outputFields, Set<JoinKey> joinKeys) {
    Map<String, List<String>> keysByStage = joinKeys.stream()
        .collect(Collectors.toMap(JoinKey::getStageName, JoinKey::getFields));

    // Collect the qualified inputs and the de-duplicated, insertion-ordered outputs of the Join operation.
    List<String> joinInputs = new ArrayList<>();
    Set<String> joinOutputs = new LinkedHashSet<>();
    for (Map.Entry<String, List<String>> stageKeys : keysByStage.entrySet()) {
        String stage = stageKeys.getKey();
        for (String keyField : stageKeys.getValue()) {
            joinInputs.add(stage + "." + keyField);
            joinOutputs.add(keyField);
        }
    }

    LinkedList<FieldOperation> operations = new LinkedList<>();
    operations.add(
        new FieldTransformOperation("Join", JOIN_OPERATION_DESCRIPTION, joinInputs, new ArrayList<>(joinOutputs)));

    // Names already produced by the Join operation; these are referenced without a stage prefix.
    Set<String> outputsSoFar = new HashSet<>(joinOutputs);

    for (JoinField selected : outputFields) {
        String stageName = selected.getStageName();
        String fieldName = selected.getFieldName();
        String alias = selected.getAlias();

        String inputName;
        if (outputsSoFar.contains(fieldName)) {
            inputName = fieldName;
        } else {
            inputName = stageName + "." + fieldName;
        }
        String outputName = alias != null ? alias : fieldName;

        if (!fieldName.equals(outputName)) {
            // The field leaves the join under a different name.
            operations.add(new FieldTransformOperation(
                String.format("Rename %s", inputName),
                RENAME_OPERATION_DESCRIPTION,
                Collections.singletonList(inputName),
                outputName));
            continue;
        }

        // Same name in and out: record an identity transform only when the field's stage has join keys
        // and this field is not one of them; join key fields are already covered by the Join operation.
        List<String> stageJoinKeys = keysByStage.get(stageName);
        if (stageJoinKeys != null && !stageJoinKeys.contains(fieldName)) {
            operations.add(new FieldTransformOperation(
                String.format("Identity %s", inputName),
                IDENTITY_OPERATION_DESCRIPTION,
                Collections.singletonList(inputName),
                outputName));
        }
    }
    return operations;
}
Also used : LinkedHashSet(java.util.LinkedHashSet) ArrayList(java.util.ArrayList) JoinField(io.cdap.cdap.etl.api.join.JoinField) LinkedList(java.util.LinkedList) FieldTransformOperation(io.cdap.cdap.etl.api.lineage.field.FieldTransformOperation) ArrayList(java.util.ArrayList) LinkedList(java.util.LinkedList) List(java.util.List) FieldOperation(io.cdap.cdap.etl.api.lineage.field.FieldOperation) Map(java.util.Map) HashSet(java.util.HashSet) LinkedHashSet(java.util.LinkedHashSet) VisibleForTesting(com.google.common.annotations.VisibleForTesting)

Example 14 with JoinField

use of io.cdap.cdap.etl.api.join.JoinField in project hydrator-plugins by cdapio.

the class JoinFieldLineageTest method testSimpleJoinWithRenameJoinKeys.

@Test
public void testSimpleJoinWithRenameJoinKeys() {
    // customer -> (id, name) -----------
    //                                   |
    //                                 JOIN -------> (id_from_customer, id_from_purchase, name, item)
    //                                   |
    // purchase -> (customer_id, item) --
    //
    // Both selected fields are join keys that leave the join under a new name.
    List<JoinField> selectedFields = new ArrayList<>(Arrays.asList(
        new JoinField("customer", "id", "id_from_customer"),
        new JoinField("purchase", "customer_id", "id_from_purchase")));

    Set<JoinKey> keys = new HashSet<>();
    keys.add(new JoinKey("customer", Collections.singletonList("id")));
    keys.add(new JoinKey("purchase", Collections.singletonList("customer_id")));

    List<FieldOperation> actual = Joiner.createFieldOperations(selectedFields, keys);

    // One Join operation over the qualified keys, then one Rename per key using the unqualified key name.
    List<FieldTransformOperation> expected = new ArrayList<>();
    expected.add(new FieldTransformOperation(
        "Join",
        Joiner.JOIN_OPERATION_DESCRIPTION,
        Arrays.asList("customer.id", "purchase.customer_id"),
        Arrays.asList("id", "customer_id")));
    expected.add(new FieldTransformOperation(
        "Rename id",
        Joiner.RENAME_OPERATION_DESCRIPTION,
        Collections.singletonList("id"),
        Collections.singletonList("id_from_customer")));
    expected.add(new FieldTransformOperation(
        "Rename customer_id",
        Joiner.RENAME_OPERATION_DESCRIPTION,
        Collections.singletonList("customer_id"),
        Collections.singletonList("id_from_purchase")));

    compareOperations(expected, actual);
}
Also used : JoinKey(io.cdap.cdap.etl.api.join.JoinKey) ArrayList(java.util.ArrayList) FieldTransformOperation(io.cdap.cdap.etl.api.lineage.field.FieldTransformOperation) JoinField(io.cdap.cdap.etl.api.join.JoinField) FieldOperation(io.cdap.cdap.etl.api.lineage.field.FieldOperation) HashSet(java.util.HashSet) Test(org.junit.Test)

Example 15 with JoinField

use of io.cdap.cdap.etl.api.join.JoinField in project hydrator-plugins by cdapio.

the class JoinFieldLineageTest method testSimpleJoinWithRenameOnAdditionalFields.

@Test
public void testSimpleJoinWithRenameOnAdditionalFields() {
    // customer -> (id, name) -----------
    //                                   |
    //                                 JOIN ---> (id_from_customer, customer_id, name_from_customer, item_from_purchase)
    //                                   |
    // purchase -> (customer_id, item) --
    //
    // One join key is renamed, one join key keeps its name, and both non-key fields are renamed.
    List<JoinField> selectedFields = new ArrayList<>(Arrays.asList(
        new JoinField("customer", "id", "id_from_customer"),
        new JoinField("customer", "name", "name_from_customer"),
        new JoinField("purchase", "customer_id", "customer_id"),
        new JoinField("purchase", "item", "item_from_purchase")));

    Set<JoinKey> keys = new HashSet<>();
    keys.add(new JoinKey("customer", Collections.singletonList("id")));
    keys.add(new JoinKey("purchase", Collections.singletonList("customer_id")));

    List<FieldOperation> actual = Joiner.createFieldOperations(selectedFields, keys);

    // No operation is expected for purchase.customer_id: it is a join key that keeps its name.
    List<FieldTransformOperation> expected = new ArrayList<>();
    expected.add(new FieldTransformOperation(
        "Join",
        Joiner.JOIN_OPERATION_DESCRIPTION,
        Arrays.asList("customer.id", "purchase.customer_id"),
        Arrays.asList("id", "customer_id")));
    expected.add(new FieldTransformOperation(
        "Rename id",
        Joiner.RENAME_OPERATION_DESCRIPTION,
        Collections.singletonList("id"),
        Collections.singletonList("id_from_customer")));
    expected.add(new FieldTransformOperation(
        "Rename customer.name",
        Joiner.RENAME_OPERATION_DESCRIPTION,
        Collections.singletonList("customer.name"),
        Collections.singletonList("name_from_customer")));
    expected.add(new FieldTransformOperation(
        "Rename purchase.item",
        Joiner.RENAME_OPERATION_DESCRIPTION,
        Collections.singletonList("purchase.item"),
        Collections.singletonList("item_from_purchase")));

    compareOperations(expected, actual);
}
Also used : JoinKey(io.cdap.cdap.etl.api.join.JoinKey) ArrayList(java.util.ArrayList) FieldTransformOperation(io.cdap.cdap.etl.api.lineage.field.FieldTransformOperation) JoinField(io.cdap.cdap.etl.api.join.JoinField) FieldOperation(io.cdap.cdap.etl.api.lineage.field.FieldOperation) HashSet(java.util.HashSet) Test(org.junit.Test)

Aggregations

JoinField (io.cdap.cdap.etl.api.join.JoinField)40 ArrayList (java.util.ArrayList)30 Schema (io.cdap.cdap.api.data.schema.Schema)20 HashSet (java.util.HashSet)19 Test (org.junit.Test)17 JoinCondition (io.cdap.cdap.etl.api.join.JoinCondition)15 StructuredRecord (io.cdap.cdap.api.data.format.StructuredRecord)14 ETLBatchConfig (io.cdap.cdap.etl.proto.v2.ETLBatchConfig)12 ETLStage (io.cdap.cdap.etl.proto.v2.ETLStage)12 HashMap (java.util.HashMap)12 JoinKey (io.cdap.cdap.etl.api.join.JoinKey)10 Table (io.cdap.cdap.api.dataset.table.Table)8 ETLPlugin (io.cdap.cdap.etl.proto.v2.ETLPlugin)8 AppRequest (io.cdap.cdap.proto.artifact.AppRequest)8 ApplicationId (io.cdap.cdap.proto.id.ApplicationId)8 ApplicationManager (io.cdap.cdap.test.ApplicationManager)8 WorkflowManager (io.cdap.cdap.test.WorkflowManager)8 List (java.util.List)8 FieldOperation (io.cdap.cdap.etl.api.lineage.field.FieldOperation)7 FieldTransformOperation (io.cdap.cdap.etl.api.lineage.field.FieldTransformOperation)7