Search in sources :

Example 1 with JoinKey

Use of io.cdap.cdap.etl.api.join.JoinKey in project cdap by caskdata.

In class MockAutoJoiner, method define.

/**
 * Builds the join definition for this joiner from its configuration.
 *
 * <p>Every stage named in the configuration is added to the join. A stage that is not listed as
 * required is marked optional, and a stage listed as broadcast has its broadcast flag set. If the
 * configuration supplies a join condition expression it is used as-is; otherwise an on-keys
 * condition is built that adds the configured key for each stage. When the configuration selects
 * no fields, every field of every stage whose schema is known is selected and aliased to
 * {@code <stage>_<field>}. The output schema and the distribution factor are only set when the
 * configuration provides them.
 *
 * @param context context supplying the input stages, looked up by stage name
 * @return the join definition, or {@code null} if the stages, key, required or select properties
 *     contain a macro
 */
@Nullable
@Override
public JoinDefinition define(AutoJoinerContext context) {
    if (conf.containsMacro(Conf.STAGES) || conf.containsMacro(Conf.KEY) || conf.containsMacro(Conf.REQUIRED) || conf.containsMacro(Conf.SELECT)) {
        return null;
    }
    Map<String, JoinStage> inputStages = context.getInputStages();
    List<JoinStage> from = new ArrayList<>(inputStages.size());
    Set<String> required = new HashSet<>(conf.getRequired());
    Set<String> broadcast = new HashSet<>(conf.getBroadcast());
    // Work on a copy: fields may be appended below, and the list handed out by the config must not
    // be modified in place (it could be unmodifiable, or shared with later calls).
    List<JoinField> selectedFields = new ArrayList<>(conf.getSelect());
    boolean shouldGenerateSelected = selectedFields.isEmpty();
    JoinCondition condition = conf.getJoinConditionExpr();
    // A key-based condition is only assembled when no explicit condition expression was configured.
    JoinCondition.OnKeys.Builder conditionBuilder = null;
    if (condition == null) {
        conditionBuilder = JoinCondition.onKeys().setNullSafe(conf.isNullSafe());
    }
    for (String stageName : conf.getStages()) {
        JoinStage.Builder stageBuilder = JoinStage.builder(inputStages.get(stageName));
        if (!required.contains(stageName)) {
            stageBuilder.isOptional();
        }
        if (broadcast.contains(stageName)) {
            stageBuilder.setBroadcast(true);
        }
        JoinStage stage = stageBuilder.build();
        from.add(stage);
        if (conditionBuilder != null) {
            conditionBuilder.addKey(new JoinKey(stageName, conf.getKey()));
        }
        Schema stageSchema = stage.getSchema();
        if (!shouldGenerateSelected || stageSchema == null) {
            // Either the selection was configured explicitly or this stage has no known schema.
            continue;
        }
        for (Schema.Field field : stageSchema.getFields()) {
            // alias everything to stage_field
            selectedFields.add(new JoinField(stageName, field.getName(), String.format("%s_%s", stageName, field.getName())));
        }
    }
    if (condition == null) {
        condition = conditionBuilder.build();
    }
    JoinDefinition.Builder builder = JoinDefinition.builder().select(selectedFields).on(condition).from(from).setOutputSchemaName(String.join(".", conf.getStages()));
    Schema outputSchema = conf.getSchema();
    if (outputSchema != null) {
        builder.setOutputSchema(outputSchema);
    }
    if (conf.getDistributionName() != null && conf.getDistributionSize() != null) {
        builder.setDistributionFactor(conf.getDistributionSize(), conf.getDistributionName());
    }
    return builder.build();
}
Also used : JoinStage(io.cdap.cdap.etl.api.join.JoinStage) JoinKey(io.cdap.cdap.etl.api.join.JoinKey) Schema(io.cdap.cdap.api.data.schema.Schema) ArrayList(java.util.ArrayList) JoinField(io.cdap.cdap.etl.api.join.JoinField) JoinCondition(io.cdap.cdap.etl.api.join.JoinCondition) JoinDefinition(io.cdap.cdap.etl.api.join.JoinDefinition) HashSet(java.util.HashSet) Nullable(javax.annotation.Nullable)

Example 2 with JoinKey

Use of io.cdap.cdap.etl.api.join.JoinKey in project cdap by caskdata.

In class PipelineSpecGeneratorTest, method testAutoJoin.

/**
 * Checks the pipeline spec generated for a pipeline in which one source fans out to two
 * transforms whose outputs are auto-joined before reaching the sink.
 */
@Test
public void testAutoJoin() {
    /*
     *           ---- transformA --------|
     *           |                       |
     * source ---|                       |-- autojoin --- sink
     *           |                       |
     *           ---- transformABC ------|
     */
    ETLBatchConfig pipelineConfig = ETLBatchConfig.builder()
            .setTimeSchedule("* * * * *")
            .addStage(new ETLStage("source", MOCK_SOURCE))
            .addStage(new ETLStage("tA", MOCK_TRANSFORM_A))
            .addStage(new ETLStage("tABC", MOCK_TRANSFORM_ABC))
            .addStage(new ETLStage("autojoin", MOCK_AUTO_JOINER))
            .addStage(new ETLStage("sink", MOCK_SINK))
            .addConnection("source", "tA")
            .addConnection("source", "tABC")
            .addConnection("tA", "autojoin")
            .addConnection("tABC", "autojoin")
            .addConnection("autojoin", "sink")
            .setNumOfRecordsPreview(100)
            .build();

    // Join definition stored in the joinDefinition field: tA is required, tABC is optional,
    // and both sides are keyed on field "a".
    JoinStage requiredSide = JoinStage.builder("tA", SCHEMA_A).isRequired().build();
    JoinStage optionalSide = JoinStage.builder("tABC", SCHEMA_ABC).isOptional().build();
    JoinCondition keyedOnA = JoinCondition.onKeys()
            .addKey(new JoinKey("tA", Collections.singletonList("a")))
            .addKey(new JoinKey("tABC", Collections.singletonList("a")))
            .build();
    joinDefinition = JoinDefinition.builder()
            .select(new JoinField("tA", "a"), new JoinField("tABC", "b"), new JoinField("tABC", "c"))
            .from(requiredSide, optionalSide)
            .on(keyedOnA)
            .setOutputSchemaName("abc.joined")
            .build();

    // Expected joiner output: "a" is non-nullable, "b" and "c" are nullable.
    Schema expectedJoinSchema = Schema.recordOf(
            "abc.joined",
            Schema.Field.of("a", Schema.of(Schema.Type.STRING)),
            Schema.Field.of("b", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
            Schema.Field.of("c", Schema.nullableOf(Schema.of(Schema.Type.INT))));

    Map<String, String> noProperties = new HashMap<>();
    PluginSpec sourcePlugin = new PluginSpec(BatchSource.PLUGIN_TYPE, "mocksource", noProperties, ARTIFACT_ID);
    PluginSpec transformAPlugin = new PluginSpec(Transform.PLUGIN_TYPE, "mockA", noProperties, ARTIFACT_ID);
    PluginSpec transformAbcPlugin = new PluginSpec(Transform.PLUGIN_TYPE, "mockABC", noProperties, ARTIFACT_ID);
    PluginSpec joinerPlugin = new PluginSpec(BatchJoiner.PLUGIN_TYPE, "mockautojoiner", noProperties, ARTIFACT_ID);
    PluginSpec sinkPlugin = new PluginSpec(BatchSink.PLUGIN_TYPE, "mocksink", noProperties, ARTIFACT_ID);

    PipelineSpec expectedSpec = BatchPipelineSpec.builder()
            .addStage(StageSpec.builder("source", sourcePlugin)
                    .addOutput(SCHEMA_A, "tA", "tABC")
                    .build())
            .addStage(StageSpec.builder("tA", transformAPlugin)
                    .addInputSchema("source", SCHEMA_A)
                    .addOutput(SCHEMA_A, "autojoin")
                    .setErrorSchema(SCHEMA_B)
                    .build())
            .addStage(StageSpec.builder("tABC", transformAbcPlugin)
                    .addInputSchema("source", SCHEMA_A)
                    .addOutput(SCHEMA_ABC, "autojoin")
                    .setErrorSchema(SCHEMA_A)
                    .build())
            .addStage(StageSpec.builder("autojoin", joinerPlugin)
                    .addInputSchema("tA", SCHEMA_A)
                    .addInputSchema("tABC", SCHEMA_ABC)
                    .addOutput(expectedJoinSchema, "sink")
                    .setErrorSchema(SCHEMA_ABC)
                    .build())
            .addStage(StageSpec.builder("sink", sinkPlugin)
                    .addInputSchema("autojoin", expectedJoinSchema)
                    .setErrorSchema(expectedJoinSchema)
                    .build())
            .addConnections(pipelineConfig.getConnections())
            .setResources(pipelineConfig.getResources())
            .setDriverResources(pipelineConfig.getDriverResources())
            .setClientResources(pipelineConfig.getClientResources())
            .setStageLoggingEnabled(pipelineConfig.isStageLoggingEnabled())
            .setNumOfRecordsPreview(pipelineConfig.getNumOfRecordsPreview())
            .build();

    PipelineSpec actualSpec = specGenerator.generateSpec(pipelineConfig);
    Assert.assertEquals(expectedSpec, actualSpec);
}
Also used : ETLBatchConfig(io.cdap.cdap.etl.proto.v2.ETLBatchConfig) PluginSpec(io.cdap.cdap.etl.proto.v2.spec.PluginSpec) JoinKey(io.cdap.cdap.etl.api.join.JoinKey) ETLStage(io.cdap.cdap.etl.proto.v2.ETLStage) HashMap(java.util.HashMap) PipelineSpec(io.cdap.cdap.etl.proto.v2.spec.PipelineSpec) BatchPipelineSpec(io.cdap.cdap.etl.batch.BatchPipelineSpec) Schema(io.cdap.cdap.api.data.schema.Schema) JoinField(io.cdap.cdap.etl.api.join.JoinField) Test(org.junit.Test)

Aggregations

Schema (io.cdap.cdap.api.data.schema.Schema)2 JoinField (io.cdap.cdap.etl.api.join.JoinField)2 JoinKey (io.cdap.cdap.etl.api.join.JoinKey)2 JoinCondition (io.cdap.cdap.etl.api.join.JoinCondition)1 JoinDefinition (io.cdap.cdap.etl.api.join.JoinDefinition)1 JoinStage (io.cdap.cdap.etl.api.join.JoinStage)1 BatchPipelineSpec (io.cdap.cdap.etl.batch.BatchPipelineSpec)1 ETLBatchConfig (io.cdap.cdap.etl.proto.v2.ETLBatchConfig)1 ETLStage (io.cdap.cdap.etl.proto.v2.ETLStage)1 PipelineSpec (io.cdap.cdap.etl.proto.v2.spec.PipelineSpec)1 PluginSpec (io.cdap.cdap.etl.proto.v2.spec.PluginSpec)1 ArrayList (java.util.ArrayList)1 HashMap (java.util.HashMap)1 HashSet (java.util.HashSet)1 Nullable (javax.annotation.Nullable)1 Test (org.junit.Test)1