Search in sources :

Example 1 with AutoJoinerContext

use of io.cdap.cdap.etl.api.join.AutoJoinerContext in project cdap by caskdata.

the class PipelineSpecGenerator method configureAutoJoiner.

/**
 * Asks the auto-joiner for its join definition at configure time, records the resulting output
 * schema on the stage configurer, and reports any mismatch between the stages named in the join
 * definition and the actual inputs of the stage.
 *
 * <p>Problems are reported through {@code collector} rather than thrown.
 *
 * @param stageName name of the joiner stage, used in failure messages
 * @param autoJoiner the plugin that produces the join definition
 * @param stageConfigurer supplies the input schemas and receives the output schema
 * @param collector receives validation failures
 */
private void configureAutoJoiner(String stageName, AutoJoiner autoJoiner, DefaultStageConfigurer stageConfigurer, FailureCollector collector) {
    AutoJoinerContext autoContext = DefaultAutoJoinerContext.from(stageConfigurer.getInputSchemas(), collector);
    JoinDefinition joinDefinition = autoJoiner.define(autoContext);
    // No definition available yet (presumably because macros are not evaluated at configure time):
    // nothing can be validated, so leave the stage untouched.
    if (joinDefinition == null) {
        return;
    }
    validateJoinCondition(stageName, joinDefinition.getCondition(), collector);
    stageConfigurer.setOutputSchema(joinDefinition.getOutputSchema());
    Set<String> inputStages = stageConfigurer.getInputSchemas().keySet();
    Set<String> joinStages = joinDefinition.getStages().stream().map(JoinStage::getStageName).collect(Collectors.toSet());
    // inputs of this stage that the join definition never mentions
    Set<String> missingInputs = Sets.difference(inputStages, joinStages);
    if (!missingInputs.isEmpty()) {
        collector.addFailure(String.format("Joiner stage '%s' did not include input stage %s in the join.", stageName, String.join(", ", missingInputs)), "Check with the plugin developer to make sure it is implemented correctly.");
    }
    // stages the join definition mentions that are not inputs of this stage
    Set<String> extraInputs = Sets.difference(joinStages, inputStages);
    if (!extraInputs.isEmpty()) {
        // report the extra stages themselves; the message previously listed missingInputs by mistake
        collector.addFailure(String.format("Joiner stage '%s' is trying to join stage %s, which is not an input.", stageName, String.join(", ", extraInputs)), "Check with the plugin developer to make sure it is implemented correctly.");
    }
}
Also used : DefaultAutoJoinerContext(io.cdap.cdap.etl.common.DefaultAutoJoinerContext) AutoJoinerContext(io.cdap.cdap.etl.api.join.AutoJoinerContext) JoinDefinition(io.cdap.cdap.etl.api.join.JoinDefinition)

Example 2 with AutoJoinerContext

use of io.cdap.cdap.etl.api.join.AutoJoinerContext in project cdap by caskdata.

the class PipelinePhasePreparer method validateAutoJoiner.

/**
 * Validates the join definition produced by an auto-joiner before the phase runs.
 *
 * <p>The definition must be non-null and must reference exactly the input stages of the joiner:
 * no input may be left out of the join and no non-input stage may be joined.
 *
 * @param autoJoiner the plugin that produces the join definition
 * @param stageSpec spec of the joiner stage; supplies its name, plugin name and input schemas
 * @throws IllegalArgumentException if the definition is null, omits an input stage, or references
 *     a stage that is not an input
 */
private void validateAutoJoiner(AutoJoiner autoJoiner, StageSpec stageSpec) {
    String stageName = stageSpec.getName();
    String pluginName = stageSpec.getPlugin().getName();
    FailureCollector failureCollector = new LoggingFailureCollector(stageSpec.getName(), stageSpec.getInputSchemas());
    AutoJoinerContext autoJoinerContext = DefaultAutoJoinerContext.from(stageSpec.getInputSchemas(), failureCollector);
    JoinDefinition joinDefinition = autoJoiner.define(autoJoinerContext);
    failureCollector.getOrThrowException();
    // validate that the join definition is not null
    // it could be null at configure time due to macros not being evaluated, but at this
    // point all macros should be evaluated and the definition should be non-null.
    if (joinDefinition == null) {
        throw new IllegalArgumentException(String.format("Joiner stage '%s' using plugin '%s' did not provide a join definition. " + "Check with the plugin developer to make sure it is implemented correctly.", stageName, pluginName));
    }
    // validate that the stages mentioned in the join definition are actually inputs into the joiner.
    Set<String> inputStages = stageSpec.getInputSchemas().keySet();
    Set<String> joinStages = joinDefinition.getStages().stream().map(JoinStage::getStageName).collect(Collectors.toSet());
    // inputs of this stage that the join definition never mentions
    Set<String> missingInputs = Sets.difference(inputStages, joinStages);
    if (!missingInputs.isEmpty()) {
        throw new IllegalArgumentException(String.format("Joiner stage '%s' using plugin '%s' did not include input stage %s in the join. " + "Check with the plugin developer to make sure it is implemented correctly.", stageName, pluginName, String.join(", ", missingInputs)));
    }
    // stages the join definition mentions that are not inputs of this stage
    Set<String> extraInputs = Sets.difference(joinStages, inputStages);
    if (!extraInputs.isEmpty()) {
        // name the extra stages themselves; the message previously listed missingInputs by mistake,
        // which is always empty at this point
        throw new IllegalArgumentException(String.format("Joiner stage '%s' using plugin '%s' is trying to join stage %s, which is not an input. " + "Check with the plugin developer to make sure it is implemented correctly.", stageName, pluginName, String.join(", ", extraInputs)));
    }
}
Also used : LoggingFailureCollector(io.cdap.cdap.etl.validation.LoggingFailureCollector) DefaultAutoJoinerContext(io.cdap.cdap.etl.common.DefaultAutoJoinerContext) AutoJoinerContext(io.cdap.cdap.etl.api.join.AutoJoinerContext) JoinDefinition(io.cdap.cdap.etl.api.join.JoinDefinition) LoggingFailureCollector(io.cdap.cdap.etl.validation.LoggingFailureCollector) FailureCollector(io.cdap.cdap.etl.api.FailureCollector)

Example 3 with AutoJoinerContext

use of io.cdap.cdap.etl.api.join.AutoJoinerContext in project cdap by caskdata.

the class SparkPipelineRunner method handleJoin.

/**
 * Dispatches a joiner stage to the matching join implementation.
 *
 * <p>A {@code BatchJoiner} is initialized and always recorded in {@code shufflers}. An
 * {@code AutoJoiner} is asked for its join definition and is recorded in {@code shufflers} only
 * when none of its join stages is a broadcast stage.
 *
 * @throws IllegalStateException if the plugin is neither a {@code BatchJoiner} nor an
 *     {@code AutoJoiner}
 */
protected SparkCollection<Object> handleJoin(Map<String, SparkCollection<Object>> inputDataCollections, PipelinePhase pipelinePhase, PluginFunctionContext pluginFunctionContext, StageSpec stageSpec, FunctionCache.Factory functionCacheFactory, Object plugin, Integer numPartitions, StageStatisticsCollector collector, Set<String> shufflers) throws Exception {
    String stageName = stageSpec.getName();

    if (plugin instanceof BatchJoiner) {
        BatchJoiner<Object, Object, Object> batchJoiner = (BatchJoiner<Object, Object, Object>) plugin;
        batchJoiner.initialize(pluginFunctionContext.createBatchRuntimeContext());
        shufflers.add(stageName);
        return handleJoin(batchJoiner, inputDataCollections, stageSpec, functionCacheFactory, numPartitions, collector);
    }

    if (!(plugin instanceof AutoJoiner)) {
        // should never happen unless there is a bug in the code. should have failed during deployment
        throw new IllegalStateException(String.format("Stage '%s' is an unknown joiner type %s", stageName, plugin.getClass().getName()));
    }

    // The schema of each input is the output schema of the stage that feeds it.
    Map<String, Schema> schemaByInput = new HashMap<>();
    for (String inputName : pipelinePhase.getStageInputs(stageName)) {
        schemaByInput.put(inputName, pipelinePhase.getStage(inputName).getOutputSchema());
    }

    FailureCollector failures = new LoggingFailureCollector(stageName, schemaByInput);
    // The definition is expected to be non-null here because PipelinePhasePreparer
    // checks it at the start of the run.
    JoinDefinition definition = ((AutoJoiner) plugin).define(DefaultAutoJoinerContext.from(schemaByInput, failures));
    failures.getOrThrowException();

    boolean hasBroadcastStage = definition.getStages().stream().anyMatch(JoinStage::isBroadcast);
    if (!hasBroadcastStage) {
        shufflers.add(stageName);
    }
    return handleAutoJoin(stageName, definition, inputDataCollections, numPartitions);
}
Also used : BatchJoinerRuntimeContext(io.cdap.cdap.etl.api.batch.BatchJoinerRuntimeContext) LoggingFailureCollector(io.cdap.cdap.etl.validation.LoggingFailureCollector) BatchJoiner(io.cdap.cdap.etl.api.batch.BatchJoiner) DefaultAutoJoinerContext(io.cdap.cdap.etl.common.DefaultAutoJoinerContext) AutoJoinerContext(io.cdap.cdap.etl.api.join.AutoJoinerContext) JoinDefinition(io.cdap.cdap.etl.api.join.JoinDefinition) StageSpec(io.cdap.cdap.etl.proto.v2.spec.StageSpec) AutoJoiner(io.cdap.cdap.etl.api.join.AutoJoiner) Map(java.util.Map) HashMap(java.util.HashMap) LoggingFailureCollector(io.cdap.cdap.etl.validation.LoggingFailureCollector) FailureCollector(io.cdap.cdap.etl.api.FailureCollector)

Example 4 with AutoJoinerContext

use of io.cdap.cdap.etl.api.join.AutoJoinerContext in project cdap by caskdata.

the class JoinMergeFunction method createInitializedJoiner.

/**
 * Creates the joiner plugin for this function and prepares it for use.
 *
 * <p>A {@code BatchAutoJoiner} is wrapped in a {@code JoinerBridge} built from its join
 * definition; any other plugin is cast to {@code BatchJoiner} and initialized with a batch
 * runtime context.
 *
 * @return the ready-to-use joiner
 * @throws IllegalStateException if an auto-joiner returns a null join definition
 */
private <K, V, O> BatchJoiner<K, V, O> createInitializedJoiner() throws Exception {
    Object plugin = pluginFunctionContext.createPlugin();

    if (!(plugin instanceof BatchAutoJoiner)) {
        BatchJoiner<K, V, O> batchJoiner = (BatchJoiner<K, V, O>) plugin;
        batchJoiner.initialize(pluginFunctionContext.createBatchRuntimeContext());
        return batchJoiner;
    }

    String stageName = pluginFunctionContext.getStageName();
    BatchAutoJoiner autoJoiner = (BatchAutoJoiner) plugin;
    AutoJoinerContext context = pluginFunctionContext.createAutoJoinerContext();
    JoinDefinition definition = autoJoiner.define(context);
    // Surface failures collected while defining the join before looking at the result.
    context.getFailureCollector().getOrThrowException();
    if (definition == null) {
        throw new IllegalStateException(String.format("Join stage '%s' did not specify a join definition. " + "Check with the plugin developer to ensure it is implemented correctly.", stageName));
    }
    BatchJoiner<K, V, O> bridge = new JoinerBridge(stageName, autoJoiner, definition);
    return bridge;
}
Also used : BatchJoinerRuntimeContext(io.cdap.cdap.etl.api.batch.BatchJoinerRuntimeContext) BatchAutoJoiner(io.cdap.cdap.etl.api.batch.BatchAutoJoiner) AutoJoinerContext(io.cdap.cdap.etl.api.join.AutoJoinerContext) JoinDefinition(io.cdap.cdap.etl.api.join.JoinDefinition) JoinerBridge(io.cdap.cdap.etl.common.plugin.JoinerBridge)

Example 5 with AutoJoinerContext

use of io.cdap.cdap.etl.api.join.AutoJoinerContext in project cdap by caskdata.

the class JoinOnFunction method createInitializedJoinOnTransform.

/**
 * Creates the joiner plugin for this function and wraps it in a {@code JoinOnTransform} for the
 * current input stage.
 *
 * <p>For a {@code BatchAutoJoiner} the join definition also decides whether records with null
 * keys are filtered out; for any other plugin they never are.
 *
 * @return the transform for {@code inputStageName}
 * @throws IllegalStateException if an auto-joiner returns a null join definition
 */
private JoinOnTransform<INPUT_RECORD, JOIN_KEY> createInitializedJoinOnTransform() throws Exception {
    Object plugin = pluginFunctionContext.createPlugin();

    if (!(plugin instanceof BatchAutoJoiner)) {
        BatchJoiner<JOIN_KEY, INPUT_RECORD, Object> batchJoiner = (BatchJoiner<JOIN_KEY, INPUT_RECORD, Object>) plugin;
        batchJoiner.initialize(pluginFunctionContext.createBatchRuntimeContext());
        return new JoinOnTransform<>(batchJoiner, inputStageName, false);
    }

    BatchAutoJoiner autoJoiner = (BatchAutoJoiner) plugin;
    AutoJoinerContext context = pluginFunctionContext.createAutoJoinerContext();
    JoinDefinition definition = autoJoiner.define(context);
    // Surface failures collected while defining the join before looking at the result.
    context.getFailureCollector().getOrThrowException();
    String stageName = pluginFunctionContext.getStageName();
    if (definition == null) {
        throw new IllegalStateException(String.format("Join stage '%s' did not specify a join definition. " + "Check with the plugin developer to ensure it is implemented correctly.", stageName));
    }

    /*
     Records are filtered when they come from an optional (non-required) stage and their key is
     null, or any field of the key is null. This only applies to key-equality joins that are
     not null-safe.

     For example, a left outer join on:

       A (id, name)  = (0, alice), (null, bob)
       B (id, email) = (0, alice@example.com), (null, placeholder@example.com)

     should produce:

       joined (A.id, A.name, B.email) = (0, alice, alice@example.com), (null, bob, null)

     The bob record must not be joined to the placeholder@example.com email even though both
     ids are null.
    */
    JoinCondition condition = definition.getCondition();
    boolean filterNullKeys = false;
    boolean nullUnsafeKeyEquality = condition.getOp() == JoinCondition.Op.KEY_EQUALITY && !((JoinCondition.OnKeys) condition).isNullSafe();
    if (nullUnsafeKeyEquality) {
        filterNullKeys = definition.getStages().stream().anyMatch(stage -> !stage.isRequired() && stage.getStageName().equals(inputStageName));
    }

    BatchJoiner<JOIN_KEY, INPUT_RECORD, Object> bridge = new JoinerBridge(stageName, autoJoiner, definition);
    return new JoinOnTransform<>(bridge, inputStageName, filterNullKeys);
}
Also used : BatchJoiner(io.cdap.cdap.etl.api.batch.BatchJoiner) Transformation(io.cdap.cdap.etl.api.Transformation) BatchJoinerRuntimeContext(io.cdap.cdap.etl.api.batch.BatchJoinerRuntimeContext) PairFlatMapFunction(org.apache.spark.api.java.function.PairFlatMapFunction) Iterator(java.util.Iterator) JoinStage(io.cdap.cdap.etl.api.join.JoinStage) JoinerBridge(io.cdap.cdap.etl.common.plugin.JoinerBridge) Tuple2(scala.Tuple2) Schema(io.cdap.cdap.api.data.schema.Schema) Constants(io.cdap.cdap.etl.common.Constants) StructuredRecord(io.cdap.cdap.api.data.format.StructuredRecord) TrackedTransform(io.cdap.cdap.etl.common.TrackedTransform) Emitter(io.cdap.cdap.etl.api.Emitter) DefaultEmitter(io.cdap.cdap.etl.common.DefaultEmitter) JoinDefinition(io.cdap.cdap.etl.api.join.JoinDefinition) AutoJoinerContext(io.cdap.cdap.etl.api.join.AutoJoinerContext) JoinCondition(io.cdap.cdap.etl.api.join.JoinCondition) BatchAutoJoiner(io.cdap.cdap.etl.api.batch.BatchAutoJoiner) BatchJoinerRuntimeContext(io.cdap.cdap.etl.api.batch.BatchJoinerRuntimeContext) JoinStage(io.cdap.cdap.etl.api.join.JoinStage) JoinCondition(io.cdap.cdap.etl.api.join.JoinCondition) BatchAutoJoiner(io.cdap.cdap.etl.api.batch.BatchAutoJoiner) AutoJoinerContext(io.cdap.cdap.etl.api.join.AutoJoinerContext) JoinDefinition(io.cdap.cdap.etl.api.join.JoinDefinition) JoinerBridge(io.cdap.cdap.etl.common.plugin.JoinerBridge)

Aggregations

AutoJoinerContext (io.cdap.cdap.etl.api.join.AutoJoinerContext)6 JoinDefinition (io.cdap.cdap.etl.api.join.JoinDefinition)6 BatchJoinerRuntimeContext (io.cdap.cdap.etl.api.batch.BatchJoinerRuntimeContext)4 DefaultAutoJoinerContext (io.cdap.cdap.etl.common.DefaultAutoJoinerContext)4 FailureCollector (io.cdap.cdap.etl.api.FailureCollector)3 BatchAutoJoiner (io.cdap.cdap.etl.api.batch.BatchAutoJoiner)3 BatchJoiner (io.cdap.cdap.etl.api.batch.BatchJoiner)3 JoinerBridge (io.cdap.cdap.etl.common.plugin.JoinerBridge)3 LoggingFailureCollector (io.cdap.cdap.etl.validation.LoggingFailureCollector)3 Schema (io.cdap.cdap.api.data.schema.Schema)2 StageSpec (io.cdap.cdap.etl.proto.v2.spec.StageSpec)2 HashMap (java.util.HashMap)2 StructuredRecord (io.cdap.cdap.api.data.format.StructuredRecord)1 Emitter (io.cdap.cdap.etl.api.Emitter)1 Transformation (io.cdap.cdap.etl.api.Transformation)1 AutoJoiner (io.cdap.cdap.etl.api.join.AutoJoiner)1 JoinCondition (io.cdap.cdap.etl.api.join.JoinCondition)1 JoinStage (io.cdap.cdap.etl.api.join.JoinStage)1 Constants (io.cdap.cdap.etl.common.Constants)1 DefaultEmitter (io.cdap.cdap.etl.common.DefaultEmitter)1