
Example 1 with JoinField

Use of io.cdap.cdap.etl.api.join.JoinField in project cdap by caskdata.

From the class BaseRDDCollection, method getSQL:

static String getSQL(JoinExpressionRequest join) {
    JoinCondition.OnExpression condition = join.getCondition();
    Map<String, String> datasetAliases = condition.getDatasetAliases();
    String leftName = join.getLeft().getStage();
    String leftAlias = datasetAliases.getOrDefault(leftName, leftName);
    String rightName = join.getRight().getStage();
    String rightAlias = datasetAliases.getOrDefault(rightName, rightName);
    StringBuilder query = new StringBuilder("SELECT ");
    // see https://spark.apache.org/docs/3.0.0/sql-ref-syntax-qry-select-hints.html for more info on join hints
    if (join.getLeft().isBroadcast() && join.getRight().isBroadcast()) {
        query.append("/*+ BROADCAST(").append(leftAlias).append("), BROADCAST(").append(rightAlias).append(") */ ");
    } else if (join.getLeft().isBroadcast()) {
        query.append("/*+ BROADCAST(").append(leftAlias).append(") */ ");
    } else if (join.getRight().isBroadcast()) {
        query.append("/*+ BROADCAST(").append(rightAlias).append(") */ ");
    }
    for (JoinField field : join.getFields()) {
        String outputName = field.getAlias() == null ? field.getFieldName() : field.getAlias();
        String datasetName = datasetAliases.getOrDefault(field.getStageName(), field.getStageName());
        // `datasetName`.`fieldName` as outputName
        query.append("`").append(datasetName).append("`.`").append(field.getFieldName()).append("` as ").append(outputName).append(", ");
    }
    // remove trailing ', '
    query.setLength(query.length() - 2);
    String joinType;
    boolean leftRequired = join.getLeft().isRequired();
    boolean rightRequired = join.getRight().isRequired();
    if (leftRequired && rightRequired) {
        joinType = "JOIN";
    } else if (leftRequired && !rightRequired) {
        joinType = "LEFT OUTER JOIN";
    } else if (!leftRequired && rightRequired) {
        joinType = "RIGHT OUTER JOIN";
    } else {
        joinType = "FULL OUTER JOIN";
    }
    // FROM `leftDataset` as `leftAlias` JOIN `rightDataset` as `rightAlias`
    query.append(" FROM `").append(leftName).append("` as `").append(leftAlias).append("` ");
    query.append(joinType).append(" `").append(rightName).append("` as `").append(rightAlias).append("` ");
    // ON [expr]
    query.append(" ON ").append(condition.getExpression());
    return query.toString();
}
Also used : JoinField(io.cdap.cdap.etl.api.join.JoinField) JoinCondition(io.cdap.cdap.etl.api.join.JoinCondition)
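
To make the string building concrete, here is the kind of statement getSQL produces for a hypothetical request (all names below are invented for illustration): a required left stage users aliased to U, an optional, broadcast right stage purchases aliased to P, selected fields users.name and purchases.price (aliased to purchase_price), and the expression U.id = P.user_id. Modulo exact whitespace, the result is a single line equivalent to:

    SELECT /*+ BROADCAST(P) */ `U`.`name` as name, `P`.`price` as purchase_price
    FROM `users` as `U` LEFT OUTER JOIN `purchases` as `P`
    ON U.id = P.user_id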

Example 2 with JoinField

Use of io.cdap.cdap.etl.api.join.JoinField in project cdap by caskdata.

From the class AutoJoinerTest, method testAutoJoinWithMacros:

private void testAutoJoinWithMacros(Engine engine, List<String> required, Schema expectedSchema, Set<StructuredRecord> expectedRecords, boolean excludeUsers, boolean excludePurchases) throws Exception {
    /*
         users ------|
                     |--> join --> sink
         purchases --|

         joinOn: users.region = purchases.region and users.user_id = purchases.user_id
     */
    String userInput = UUID.randomUUID().toString();
    String purchaseInput = UUID.randomUUID().toString();
    String output = UUID.randomUUID().toString();
    Map<String, String> joinerProps = new HashMap<>();
    joinerProps.put(MockAutoJoiner.Conf.STAGES, "${stages}");
    joinerProps.put(MockAutoJoiner.Conf.KEY, "${key}");
    joinerProps.put(MockAutoJoiner.Conf.REQUIRED, "${required}");
    joinerProps.put(MockAutoJoiner.Conf.SELECT, "${select}");
    if (engine == Engine.SPARK || (required.size() < 2 && engine == Engine.MAPREDUCE)) {
        joinerProps.put(MockAutoJoiner.Conf.SCHEMA, "${schema}");
    }
    ETLBatchConfig config = ETLBatchConfig.builder()
        .addStage(new ETLStage("users", MockSource.getPlugin(userInput)))
        .addStage(new ETLStage("purchases", MockSource.getPlugin(purchaseInput)))
        .addStage(new ETLStage("join", new ETLPlugin(MockAutoJoiner.NAME, BatchJoiner.PLUGIN_TYPE, joinerProps)))
        .addStage(new ETLStage("sink", MockSink.getPlugin(output)))
        .addConnection("users", "join")
        .addConnection("purchases", "join")
        .addConnection("join", "sink")
        .setEngine(engine)
        .build();
    AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, config);
    ApplicationId appId = NamespaceId.DEFAULT.app(UUID.randomUUID().toString());
    ApplicationManager appManager = deployApplication(appId, appRequest);
    // write input data
    if (!excludeUsers) {
        List<StructuredRecord> userData = Arrays.asList(USER_ALICE, USER_ALYCE, USER_BOB);
        DataSetManager<Table> inputManager = getDataset(userInput);
        MockSource.writeInput(inputManager, userData);
    }
    if (!excludePurchases) {
        List<StructuredRecord> purchaseData = new ArrayList<>();
        purchaseData.add(StructuredRecord.builder(PURCHASE_SCHEMA).set("region", "us").set("user_id", 0).set("purchase_id", 123).build());
        purchaseData.add(StructuredRecord.builder(PURCHASE_SCHEMA).set("region", "us").set("user_id", 2).set("purchase_id", 456).build());
        DataSetManager<Table> inputManager = getDataset(purchaseInput);
        MockSource.writeInput(inputManager, purchaseData);
    }
    WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
    List<JoinField> selectedFields = new ArrayList<>();
    selectedFields.add(new JoinField("purchases", "region"));
    selectedFields.add(new JoinField("purchases", "purchase_id"));
    selectedFields.add(new JoinField("purchases", "user_id"));
    selectedFields.add(new JoinField("users", "name"));
    Map<String, String> joinerProperties = MockAutoJoiner.getProperties(Arrays.asList("purchases", "users"), Arrays.asList("region", "user_id"), required, Collections.emptyList(), selectedFields, true);
    Map<String, String> runtimeArgs = new HashMap<>();
    runtimeArgs.put("stages", joinerProperties.get(MockAutoJoiner.Conf.STAGES));
    runtimeArgs.put("key", joinerProperties.get(MockAutoJoiner.Conf.KEY));
    runtimeArgs.put("required", joinerProperties.get(MockAutoJoiner.Conf.REQUIRED));
    runtimeArgs.put("select", joinerProperties.get(MockAutoJoiner.Conf.SELECT));
    runtimeArgs.put("schema", expectedSchema.toString());
    workflowManager.startAndWaitForGoodRun(runtimeArgs, ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
    DataSetManager<Table> outputManager = getDataset(output);
    List<StructuredRecord> outputRecords = MockSink.readOutput(outputManager);
    Assert.assertEquals(expectedRecords, new HashSet<>(outputRecords));
}
Also used : ApplicationManager(io.cdap.cdap.test.ApplicationManager) Table(io.cdap.cdap.api.dataset.table.Table) HashMap(java.util.HashMap) WorkflowManager(io.cdap.cdap.test.WorkflowManager) ArrayList(java.util.ArrayList) JoinField(io.cdap.cdap.etl.api.join.JoinField) ETLPlugin(io.cdap.cdap.etl.proto.v2.ETLPlugin) StructuredRecord(io.cdap.cdap.api.data.format.StructuredRecord) AppRequest(io.cdap.cdap.proto.artifact.AppRequest) ETLBatchConfig(io.cdap.cdap.etl.proto.v2.ETLBatchConfig) ETLStage(io.cdap.cdap.etl.proto.v2.ETLStage) ApplicationId(io.cdap.cdap.proto.id.ApplicationId)
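
Note the two JoinField constructors in play across these tests: the two-argument form keeps the original field name in the join output, while the three-argument form (used in the next example) renames it. A minimal sketch of the difference:

    // Two-argument form: the output column keeps the source field's name ("purchase_id").
    JoinField keepName = new JoinField("purchases", "purchase_id");
    // Three-argument form: the output column is renamed to the alias ("username").
    JoinField renamed = new JoinField("users", "name", "username");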

Example 3 with JoinField

Use of io.cdap.cdap.etl.api.join.JoinField in project cdap by caskdata.

From the class AutoJoinerTest, method testInnerBetweenCondition:

@Test
public void testInnerBetweenCondition() throws Exception {
    /*
         users ----------|
                         |--> join --> sink
         age_groups -----|

         joinOn: users.age >= age_groups.lo and (users.age < age_groups.hi or age_groups.hi is null)
     */
    Schema userSchema = Schema.recordOf("user", Schema.Field.of("name", Schema.of(Schema.Type.STRING)), Schema.Field.of("age", Schema.nullableOf(Schema.of(Schema.Type.INT))));
    Schema ageGroupSchema = Schema.recordOf("age_group", Schema.Field.of("name", Schema.of(Schema.Type.STRING)), Schema.Field.of("lo", Schema.of(Schema.Type.INT)), Schema.Field.of("hi", Schema.nullableOf(Schema.of(Schema.Type.INT))));
    Schema expectedSchema = Schema.recordOf("users.age_groups", Schema.Field.of("username", Schema.of(Schema.Type.STRING)), Schema.Field.of("age_group", Schema.of(Schema.Type.STRING)));
    String userInput = UUID.randomUUID().toString();
    String agesInput = UUID.randomUUID().toString();
    String output = UUID.randomUUID().toString();
    List<JoinField> select = new ArrayList<>();
    select.add(new JoinField("users", "name", "username"));
    select.add(new JoinField("age_groups", "name", "age_group"));
    JoinCondition.OnExpression condition = JoinCondition.onExpression().setExpression("users.age >= age_groups.lo and (users.age < age_groups.hi or age_groups.hi is null)").build();
    Map<String, String> joinerProperties = MockAutoJoiner.getProperties(Arrays.asList("users", "age_groups"), Collections.emptyList(), Arrays.asList("users", "age_groups"), Collections.emptyList(), select, false, null, condition);
    ETLBatchConfig config = ETLBatchConfig.builder()
        .addStage(new ETLStage("users", MockSource.getPlugin(userInput, userSchema)))
        .addStage(new ETLStage("age_groups", MockSource.getPlugin(agesInput, ageGroupSchema)))
        .addStage(new ETLStage("join", new ETLPlugin(MockAutoJoiner.NAME, BatchJoiner.PLUGIN_TYPE, joinerProperties)))
        .addStage(new ETLStage("sink", MockSink.getPlugin(output)))
        .addConnection("users", "join")
        .addConnection("age_groups", "join")
        .addConnection("join", "sink")
        .setEngine(Engine.SPARK)
        .build();
    AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, config);
    ApplicationId appId = NamespaceId.DEFAULT.app(UUID.randomUUID().toString());
    ApplicationManager appManager = deployApplication(appId, appRequest);
    List<StructuredRecord> records = new ArrayList<>();
    records.add(StructuredRecord.builder(userSchema).set("name", "Alice").set("age", 35).build());
    records.add(StructuredRecord.builder(userSchema).set("name", "Bob").build());
    records.add(StructuredRecord.builder(userSchema).set("name", "Carl").set("age", 13).build());
    records.add(StructuredRecord.builder(userSchema).set("name", "Dave").set("age", 0).build());
    records.add(StructuredRecord.builder(userSchema).set("name", "Elaine").set("age", 68).build());
    records.add(StructuredRecord.builder(userSchema).set("name", "Fred").set("age", 4).build());
    DataSetManager<Table> inputManager = getDataset(userInput);
    MockSource.writeInput(inputManager, records);
    records.clear();
    records.add(StructuredRecord.builder(ageGroupSchema).set("name", "infant").set("lo", 0).set("hi", 2).build());
    records.add(StructuredRecord.builder(ageGroupSchema).set("name", "toddler").set("lo", 2).set("hi", 5).build());
    records.add(StructuredRecord.builder(ageGroupSchema).set("name", "child").set("lo", 5).set("hi", 13).build());
    records.add(StructuredRecord.builder(ageGroupSchema).set("name", "teen").set("lo", 13).set("hi", 20).build());
    records.add(StructuredRecord.builder(ageGroupSchema).set("name", "adult").set("lo", 20).set("hi", 65).build());
    records.add(StructuredRecord.builder(ageGroupSchema).set("name", "senior").set("lo", 65).build());
    inputManager = getDataset(agesInput);
    MockSource.writeInput(inputManager, records);
    WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
    workflowManager.startAndWaitForGoodRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
    DataSetManager<Table> outputManager = getDataset(output);
    List<StructuredRecord> outputRecords = MockSink.readOutput(outputManager);
    Set<StructuredRecord> expected = new HashSet<>();
    expected.add(StructuredRecord.builder(expectedSchema).set("username", "Alice").set("age_group", "adult").build());
    expected.add(StructuredRecord.builder(expectedSchema).set("username", "Carl").set("age_group", "teen").build());
    expected.add(StructuredRecord.builder(expectedSchema).set("username", "Dave").set("age_group", "infant").build());
    expected.add(StructuredRecord.builder(expectedSchema).set("username", "Elaine").set("age_group", "senior").build());
    expected.add(StructuredRecord.builder(expectedSchema).set("username", "Fred").set("age_group", "toddler").build());
    Assert.assertEquals(expected, new HashSet<>(outputRecords));
    validateMetric(6, appId, "users.records.out");
    validateMetric(6, appId, "age_groups.records.out");
    validateMetric(12, appId, "join.records.in");
    validateMetric(expected.size(), appId, "join.records.out");
}
Also used : ApplicationManager(io.cdap.cdap.test.ApplicationManager) Table(io.cdap.cdap.api.dataset.table.Table) Schema(io.cdap.cdap.api.data.schema.Schema) WorkflowManager(io.cdap.cdap.test.WorkflowManager) ArrayList(java.util.ArrayList) JoinField(io.cdap.cdap.etl.api.join.JoinField) ETLPlugin(io.cdap.cdap.etl.proto.v2.ETLPlugin) StructuredRecord(io.cdap.cdap.api.data.format.StructuredRecord) JoinCondition(io.cdap.cdap.etl.api.join.JoinCondition) AppRequest(io.cdap.cdap.proto.artifact.AppRequest) ETLBatchConfig(io.cdap.cdap.etl.proto.v2.ETLBatchConfig) ETLStage(io.cdap.cdap.etl.proto.v2.ETLStage) ApplicationId(io.cdap.cdap.proto.id.ApplicationId) HashSet(java.util.HashSet) Test(org.junit.Test)
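
Assuming this expression-based condition is executed through the SQL path shown in Example 1 (the Spark engine is selected and the condition is an OnExpression), the generated query would look roughly like the following. No dataset aliases are declared, so the stage names double as aliases, both stages are required (plain JOIN), and neither is broadcast (no hint):

    SELECT `users`.`name` as username, `age_groups`.`name` as age_group
    FROM `users` as `users` JOIN `age_groups` as `age_groups`
    ON users.age >= age_groups.lo and (users.age < age_groups.hi or age_groups.hi is null)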

Example 4 with JoinField

Use of io.cdap.cdap.etl.api.join.JoinField in project cdap by caskdata.

From the class AutoJoinerTest, method testLeftOuterComplexConditionBroadcast:

@Test
public void testLeftOuterComplexConditionBroadcast() throws Exception {
    /*
         sales ----------|
                         |--> join --> sink
         categories -----|

         joinOn:
           sales.price > 1000 and sales.date > '2020-01-01 00:00:00' and
           (sales.category = categories.id or (sales.category is null and sales.department = categories.department))
     */
    Schema salesSchema = Schema.recordOf("sale",
        Schema.Field.of("id", Schema.of(Schema.Type.INT)),
        Schema.Field.of("price", Schema.of(Schema.Type.DOUBLE)),
        Schema.Field.of("date", Schema.of(Schema.LogicalType.DATETIME)),
        Schema.Field.of("category", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
        Schema.Field.of("department", Schema.nullableOf(Schema.of(Schema.Type.STRING))));
    Schema categorySchema = Schema.recordOf("category",
        Schema.Field.of("id", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
        Schema.Field.of("department", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
        Schema.Field.of("flag", Schema.nullableOf(Schema.of(Schema.Type.BOOLEAN))));
    Schema expectedSchema = Schema.recordOf("sales.categories",
        Schema.Field.of("id", Schema.of(Schema.Type.INT)),
        Schema.Field.of("flag", Schema.nullableOf(Schema.of(Schema.Type.BOOLEAN))));
    String salesInput = UUID.randomUUID().toString();
    String categoriesInput = UUID.randomUUID().toString();
    String output = UUID.randomUUID().toString();
    List<JoinField> select = new ArrayList<>();
    select.add(new JoinField("sales", "id"));
    select.add(new JoinField("categories", "flag"));
    /*
           the same condition expressed with the dataset aliases used below (S = sales, C = categories):
           S.price > 1000 and S.date > '2020-01-01 00:00:00' and
           (S.category = C.id or (S.category is null and S.department = C.department))
     */
    JoinCondition.OnExpression condition = JoinCondition.onExpression()
        .addDatasetAlias("sales", "S")
        .addDatasetAlias("categories", "C")
        .setExpression("S.price > 1000 and S.date > '2020-01-01 00:00:00' and " +
                         "(S.category = C.id or (S.category is null and S.department = C.department))")
        .build();
    Map<String, String> joinerProperties = MockAutoJoiner.getProperties(Arrays.asList("sales", "categories"), Collections.emptyList(), Collections.singletonList("sales"), Collections.singletonList("categories"), select, false, null, condition);
    ETLBatchConfig config = ETLBatchConfig.builder()
        .addStage(new ETLStage("sales", MockSource.getPlugin(salesInput, salesSchema)))
        .addStage(new ETLStage("categories", MockSource.getPlugin(categoriesInput, categorySchema)))
        .addStage(new ETLStage("join", new ETLPlugin(MockAutoJoiner.NAME, BatchJoiner.PLUGIN_TYPE, joinerProperties)))
        .addStage(new ETLStage("sink", MockSink.getPlugin(output)))
        .addConnection("sales", "join")
        .addConnection("categories", "join")
        .addConnection("join", "sink")
        .setEngine(Engine.SPARK)
        .build();
    AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, config);
    ApplicationId appId = NamespaceId.DEFAULT.app(UUID.randomUUID().toString());
    ApplicationManager appManager = deployApplication(appId, appRequest);
    List<StructuredRecord> records = new ArrayList<>();
    records.add(StructuredRecord.builder(salesSchema).set("id", 0).set("price", 123.45d).set("date", "2021-01-01 00:00:00").set("category", "electronics").set("department", "entertainment").build());
    records.add(StructuredRecord.builder(salesSchema).set("id", 1).set("price", 1000.01d).set("date", "2020-01-01 00:00:01").set("department", "home").build());
    records.add(StructuredRecord.builder(salesSchema).set("id", 2).set("price", 5000d).set("date", "2021-01-01 00:00:00").set("category", "furniture").build());
    records.add(StructuredRecord.builder(salesSchema).set("id", 3).set("price", 2000d).set("date", "2019-12-31 23:59:59").set("category", "furniture").build());
    records.add(StructuredRecord.builder(salesSchema).set("id", 4).set("price", 2000d).set("date", "2020-01-01 12:00:00").set("category", "tv").set("department", "entertainment").build());
    DataSetManager<Table> inputManager = getDataset(salesInput);
    MockSource.writeInput(inputManager, records);
    records.clear();
    records.add(StructuredRecord.builder(categorySchema).set("id", "electronics").set("department", "entertainment").set("flag", false).build());
    records.add(StructuredRecord.builder(categorySchema).set("id", "furniture").set("department", "home").set("flag", true).build());
    records.add(StructuredRecord.builder(categorySchema).set("id", "tv").set("department", "entertainment").set("flag", false).build());
    inputManager = getDataset(categoriesInput);
    MockSource.writeInput(inputManager, records);
    WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
    workflowManager.startAndWaitForGoodRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
    DataSetManager<Table> outputManager = getDataset(output);
    List<StructuredRecord> outputRecords = MockSink.readOutput(outputManager);
    Set<StructuredRecord> expected = new HashSet<>();
    expected.add(StructuredRecord.builder(expectedSchema).set("id", 0).build());
    expected.add(StructuredRecord.builder(expectedSchema).set("id", 1).set("flag", true).build());
    expected.add(StructuredRecord.builder(expectedSchema).set("id", 2).set("flag", true).build());
    expected.add(StructuredRecord.builder(expectedSchema).set("id", 3).build());
    expected.add(StructuredRecord.builder(expectedSchema).set("id", 4).set("flag", false).build());
    Assert.assertEquals(expected, new HashSet<>(outputRecords));
}
Also used : ApplicationManager(io.cdap.cdap.test.ApplicationManager) Table(io.cdap.cdap.api.dataset.table.Table) Schema(io.cdap.cdap.api.data.schema.Schema) WorkflowManager(io.cdap.cdap.test.WorkflowManager) ArrayList(java.util.ArrayList) JoinField(io.cdap.cdap.etl.api.join.JoinField) ETLPlugin(io.cdap.cdap.etl.proto.v2.ETLPlugin) StructuredRecord(io.cdap.cdap.api.data.format.StructuredRecord) JoinCondition(io.cdap.cdap.etl.api.join.JoinCondition) AppRequest(io.cdap.cdap.proto.artifact.AppRequest) ETLBatchConfig(io.cdap.cdap.etl.proto.v2.ETLBatchConfig) ETLStage(io.cdap.cdap.etl.proto.v2.ETLStage) ApplicationId(io.cdap.cdap.proto.id.ApplicationId) HashSet(java.util.HashSet) Test(org.junit.Test)
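
Here the fourth argument to MockAutoJoiner.getProperties appears to mark the optional categories stage for broadcast; under that assumption, and again going through the getSQL path from Example 1, the generated statement would look roughly like:

    SELECT /*+ BROADCAST(C) */ `S`.`id` as id, `C`.`flag` as flag
    FROM `sales` as `S` LEFT OUTER JOIN `categories` as `C`
    ON S.price > 1000 and S.date > '2020-01-01 00:00:00' and
       (S.category = C.id or (S.category is null and S.department = C.department))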

Example 5 with JoinField

Use of io.cdap.cdap.etl.api.join.JoinField in project cdap by caskdata.

From the class JoinerBridge, method generateOutputSchema:

private Schema generateOutputSchema(Iterable<JoinElement<INPUT_RECORD>> elements) {
    Map<String, Schema> stageSchemas = new HashMap<>();
    for (JoinElement<INPUT_RECORD> joinElement : elements) {
        StructuredRecord joinRecord = (StructuredRecord) joinElement.getInputRecord();
        stageSchemas.put(joinElement.getStageName(), joinRecord.getSchema());
    }
    List<Schema.Field> fields = new ArrayList<>(joinDefinition.getSelectedFields().size());
    for (JoinField joinField : joinDefinition.getSelectedFields()) {
        String originalName = joinField.getFieldName();
        String outputName = joinField.getAlias() == null ? originalName : joinField.getAlias();
        Schema stageSchema = stageSchemas.get(joinField.getStageName());
        if (stageSchema == null) {
            // should not be possible, should be validated earlier
            throw new IllegalArgumentException(String.format("Unable to select field '%s' from stage '%s' because data for the stage could not be found.", originalName, joinField.getStageName()));
        }
        Schema.Field stageField = stageSchema.getField(originalName);
        if (stageField == null) {
            // should not be possible, should be validated earlier
            throw new IllegalArgumentException(String.format("Unable to select field '%s' from stage '%s' because the field for the stage could not be found.", originalName, joinField.getStageName()));
        }
        fields.add(Schema.Field.of(outputName, stageField.getSchema()));
    }
    return Schema.recordOf("joined", fields);
}
Also used : HashMap(java.util.HashMap) Schema(io.cdap.cdap.api.data.schema.Schema) ArrayList(java.util.ArrayList) JoinField(io.cdap.cdap.etl.api.join.JoinField) StructuredRecord(io.cdap.cdap.api.data.format.StructuredRecord)
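
As a hedged illustration of the renaming behaviour (the stage, field, and schema below are invented), a selected field users.name with alias username contributes a field named username to the output record, typed from the source stage's schema, and all selected fields are collected into a record always named "joined":

    Schema userSchema = Schema.recordOf("user",
        Schema.Field.of("name", Schema.nullableOf(Schema.of(Schema.Type.STRING))));
    JoinField selected = new JoinField("users", "name", "username");
    // generateOutputSchema looks up "name" in the users schema and emits it under the alias...
    Schema.Field outputField = Schema.Field.of("username", userSchema.getField("name").getSchema());
    // ...then wraps all such fields: Schema.recordOf("joined", fields)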

Aggregations

JoinField (io.cdap.cdap.etl.api.join.JoinField): 15 usages
ArrayList (java.util.ArrayList): 11 usages
Schema (io.cdap.cdap.api.data.schema.Schema): 10 usages
StructuredRecord (io.cdap.cdap.api.data.format.StructuredRecord): 7 usages
JoinCondition (io.cdap.cdap.etl.api.join.JoinCondition): 7 usages
ETLBatchConfig (io.cdap.cdap.etl.proto.v2.ETLBatchConfig): 6 usages
ETLStage (io.cdap.cdap.etl.proto.v2.ETLStage): 6 usages
HashMap (java.util.HashMap): 6 usages
HashSet (java.util.HashSet): 6 usages
Test (org.junit.Test): 5 usages
Table (io.cdap.cdap.api.dataset.table.Table): 4 usages
ETLPlugin (io.cdap.cdap.etl.proto.v2.ETLPlugin): 4 usages
AppRequest (io.cdap.cdap.proto.artifact.AppRequest): 4 usages
ApplicationId (io.cdap.cdap.proto.id.ApplicationId): 4 usages
ApplicationManager (io.cdap.cdap.test.ApplicationManager): 4 usages
WorkflowManager (io.cdap.cdap.test.WorkflowManager): 4 usages
List (java.util.List): 3 usages
VisibleForTesting (com.google.common.annotations.VisibleForTesting): 2 usages
JoinKey (io.cdap.cdap.etl.api.join.JoinKey): 2 usages
JoinStage (io.cdap.cdap.etl.api.join.JoinStage): 2 usages