Search in sources:

Example 6 with JoinError

use of io.cdap.cdap.etl.api.join.error.JoinError in project cdap by cdapio.

In the class InvalidJoinException, the method getMessage:

/**
 * Derives a single display message from a collection of join errors.
 *
 * <p>Only the first error in iteration order is used: its message is terminated with a
 * period (if it does not already end with one) and its corrective action is appended.
 *
 * @param errors the errors attached to the invalid join; must be non-empty
 * @return the first error's message followed by its corrective action
 * @throws IllegalStateException if {@code errors} is empty
 */
private static String getMessage(Collection<JoinError> errors) {
    if (errors.isEmpty()) {
        throw new IllegalStateException(
            "An invalid join must contain at least one error, or it must provide an error message.");
    }
    JoinError firstError = errors.iterator().next();
    String text = firstError.getMessage();
    // Add a terminating period only when the message does not already end with one.
    String separator = text.endsWith(".") ? "" : ".";
    return text + separator + " " + firstError.getCorrectiveAction();
}
Also used : JoinError(io.cdap.cdap.etl.api.join.error.JoinError)

Example 7 with JoinError

use of io.cdap.cdap.etl.api.join.error.JoinError in project hydrator-plugins by cdapio.

In the class Joiner, the method define:

/**
 * Builds the {@link JoinDefinition} for this joiner from the plugin configuration.
 *
 * <p>Validation failures are accumulated on the context's {@link FailureCollector}; fatal
 * problems are surfaced via {@code collector.getOrThrowException()}. Returns {@code null}
 * when required properties still contain unevaluated macros, signalling that the
 * definition cannot be computed yet.
 *
 * @param context supplies the input stages and the failure collector
 * @return the join definition, or {@code null} if required properties contain macros
 */
@Nullable
@Override
public JoinDefinition define(AutoJoinerContext context) {
    FailureCollector collector = context.getFailureCollector();
    // A null value in the input-stages map means that stage's schema is unknown.
    boolean hasUnknownInputSchema = context.getInputStages().values().stream().anyMatch(Objects::isNull);
    if (hasUnknownInputSchema && !conf.containsMacro(JoinerConfig.OUTPUT_SCHEMA) && conf.getOutputSchema(collector) == null) {
        // If input schemas are unknown, an output schema must be provided.
        collector.addFailure("Output schema must be specified", null).withConfigProperty(JoinerConfig.OUTPUT_SCHEMA);
    }
    if (conf.requiredPropertiesContainMacros()) {
        // Macros not yet evaluated; defer definition until runtime.
        return null;
    }
    Set<String> requiredStages = conf.getRequiredInputs();
    Set<String> broadcastStages = conf.getBroadcastInputs();
    List<JoinStage> inputs = new ArrayList<>(context.getInputStages().size());
    boolean useOutputSchema = false;
    for (JoinStage joinStage : context.getInputStages().values()) {
        // Rebuild each stage, marking it required and/or broadcast per the config.
        inputs.add(JoinStage.builder(joinStage).setRequired(requiredStages.contains(joinStage.getStageName())).setBroadcast(broadcastStages.contains(joinStage.getStageName())).build());
        // If any input schema is missing, fall back to the configured output schema.
        useOutputSchema = useOutputSchema || joinStage.getSchema() == null;
    }
    JoinCondition condition = conf.getCondition(collector);
    if (condition.getOp() == JoinCondition.Op.EXPRESSION) {
        // Advanced (expression) conditions are only supported for exactly two inputs.
        if (inputs.size() != 2) {
            collector.addFailure("Advanced join conditions can only be used when there are two inputs.", null).withConfigProperty(JoinerConfig.CONDITION_TYPE);
            throw collector.getOrThrowException();
        }
        /*
         If this is an outer join of some kind and it is not a broadcast join, add a failure.
         this is because any outer join that is not an equality join in Spark will get turned into
         a BroadcastNestedLoopJoin anyway. So it is better to make that behavior explicit to the user
         and force them to specify which side should be broadcast. This also prevents problems where
         Spark will just choose to broadcast the right side because it doesn't know how big the input datasets are.
         See CDAP-17718 for more info.
        */
        if (requiredStages.size() < inputs.size() && broadcastStages.isEmpty()) {
            collector.addFailure("Advanced outer joins must specify an input to load in memory.", null).withConfigProperty(JoinerConfig.MEMORY_INPUTS);
        }
    }
    // Validate Join Left Side property
    // NOTE(review): the check fails when the configured most-skewed input matches none of the
    // input stage names, but the message reads as if more than one was specified — confirm intent.
    if (!conf.mostSkewedInputContainsMacro() && !Strings.isNullOrEmpty(conf.getMostSkewedInput()) && inputs.stream().map(JoinStage::getStageName).noneMatch(sn -> Objects.equals(sn, conf.getMostSkewedInput()))) {
        collector.addFailure("Only one stage can be specified as the stage with the larger skew.", "Please select only one stage.").withConfigProperty(JoinerConfig.MOST_SKEWED_INPUT);
    }
    try {
        JoinDefinition.Builder joinBuilder = JoinDefinition.builder();
        // always first.
        if (!conf.mostSkewedInputContainsMacro() && !Strings.isNullOrEmpty(conf.getMostSkewedInput())) {
            // Move the most skewed stage to the front of the input list.
            reorderJoinStages(inputs, conf.getMostSkewedInput());
        }
        joinBuilder.select(conf.getSelectedFields(collector)).from(inputs).on(condition);
        if (useOutputSchema) {
            // At least one input schema was unknown: use the explicitly configured schema.
            joinBuilder.setOutputSchema(conf.getOutputSchema(collector));
        } else {
            joinBuilder.setOutputSchemaName("join.output");
        }
        if (conf.isDistributionValid(collector)) {
            joinBuilder.setDistributionFactor(conf.getDistributionFactor(), conf.getDistributionStageName());
        }
        return joinBuilder.build();
    } catch (InvalidJoinException e) {
        // Translate structured join errors into ValidationFailures tied to config properties.
        if (e.getErrors().isEmpty()) {
            collector.addFailure(e.getMessage(), null);
        }
        for (JoinError error : e.getErrors()) {
            ValidationFailure failure = collector.addFailure(error.getMessage(), error.getCorrectiveAction());
            // Attach each failure to the config property most relevant to the error type.
            switch(error.getType()) {
                case JOIN_KEY:
                case JOIN_KEY_FIELD:
                    failure.withConfigProperty(JoinerConfig.JOIN_KEYS);
                    break;
                case SELECTED_FIELD:
                    JoinField badField = ((SelectedFieldError) error).getField();
                    failure.withConfigElement(JoinerConfig.SELECTED_FIELDS, String.format("%s.%s as %s", badField.getStageName(), badField.getFieldName(), badField.getAlias()));
                    break;
                case OUTPUT_SCHEMA:
                    OutputSchemaError schemaError = (OutputSchemaError) error;
                    failure.withOutputSchemaField(schemaError.getField());
                    break;
                case DISTRIBUTION_SIZE:
                    failure.withConfigProperty(JoinerConfig.DISTRIBUTION_FACTOR);
                    break;
                case DISTRIBUTION_STAGE:
                    failure.withConfigProperty(JoinerConfig.DISTRIBUTION_STAGE);
                    break;
                case BROADCAST:
                    failure.withConfigProperty(JoinerConfig.MEMORY_INPUTS);
                    break;
                case INVALID_CONDITION:
                    failure.withConfigProperty(JoinerConfig.CONDITION_EXPR);
                    // no break needed: last case in the switch
            }
        }
        throw collector.getOrThrowException();
    }
}
Also used : FieldOperation(io.cdap.cdap.etl.api.lineage.field.FieldOperation) Description(io.cdap.cdap.api.annotation.Description) BatchJoinerContext(io.cdap.cdap.etl.api.batch.BatchJoinerContext) LoggerFactory(org.slf4j.LoggerFactory) FailureCollector(io.cdap.cdap.etl.api.FailureCollector) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) Strings(com.google.common.base.Strings) Name(io.cdap.cdap.api.annotation.Name) Map(java.util.Map) InvalidJoinException(io.cdap.cdap.etl.api.join.InvalidJoinException) BatchAutoJoiner(io.cdap.cdap.etl.api.batch.BatchAutoJoiner) LinkedList(java.util.LinkedList) LinkedHashSet(java.util.LinkedHashSet) Nullable(javax.annotation.Nullable) BatchJoiner(io.cdap.cdap.etl.api.batch.BatchJoiner) JoinError(io.cdap.cdap.etl.api.join.error.JoinError) FieldTransformOperation(io.cdap.cdap.etl.api.lineage.field.FieldTransformOperation) Logger(org.slf4j.Logger) JoinField(io.cdap.cdap.etl.api.join.JoinField) JoinStage(io.cdap.cdap.etl.api.join.JoinStage) Set(java.util.Set) Plugin(io.cdap.cdap.api.annotation.Plugin) Collectors(java.util.stream.Collectors) SelectedFieldError(io.cdap.cdap.etl.api.join.error.SelectedFieldError) JoinKey(io.cdap.cdap.etl.api.join.JoinKey) Objects(java.util.Objects) List(java.util.List) ValidationFailure(io.cdap.cdap.etl.api.validation.ValidationFailure) OutputSchemaError(io.cdap.cdap.etl.api.join.error.OutputSchemaError) JoinDefinition(io.cdap.cdap.etl.api.join.JoinDefinition) AutoJoinerContext(io.cdap.cdap.etl.api.join.AutoJoinerContext) VisibleForTesting(com.google.common.annotations.VisibleForTesting) JoinCondition(io.cdap.cdap.etl.api.join.JoinCondition) Collections(java.util.Collections) JoinStage(io.cdap.cdap.etl.api.join.JoinStage) ArrayList(java.util.ArrayList) InvalidJoinException(io.cdap.cdap.etl.api.join.InvalidJoinException) JoinField(io.cdap.cdap.etl.api.join.JoinField) OutputSchemaError(io.cdap.cdap.etl.api.join.error.OutputSchemaError) JoinCondition(io.cdap.cdap.etl.api.join.JoinCondition) 
ValidationFailure(io.cdap.cdap.etl.api.validation.ValidationFailure) JoinError(io.cdap.cdap.etl.api.join.error.JoinError) JoinDefinition(io.cdap.cdap.etl.api.join.JoinDefinition) Objects(java.util.Objects) FailureCollector(io.cdap.cdap.etl.api.FailureCollector) Nullable(javax.annotation.Nullable)

Aggregations

JoinError (io.cdap.cdap.etl.api.join.error.JoinError)7 OutputSchemaError (io.cdap.cdap.etl.api.join.error.OutputSchemaError)3 ArrayList (java.util.ArrayList)3 Map (java.util.Map)3 Schema (io.cdap.cdap.api.data.schema.Schema)2 BroadcastError (io.cdap.cdap.etl.api.join.error.BroadcastError)2 DistributionSizeError (io.cdap.cdap.etl.api.join.error.DistributionSizeError)2 DistributionStageError (io.cdap.cdap.etl.api.join.error.DistributionStageError)2 Collection (java.util.Collection)2 HashMap (java.util.HashMap)2 Test (org.junit.Test)2 VisibleForTesting (com.google.common.annotations.VisibleForTesting)1 Strings (com.google.common.base.Strings)1 Description (io.cdap.cdap.api.annotation.Description)1 Name (io.cdap.cdap.api.annotation.Name)1 Plugin (io.cdap.cdap.api.annotation.Plugin)1 FailureCollector (io.cdap.cdap.etl.api.FailureCollector)1 BatchAutoJoiner (io.cdap.cdap.etl.api.batch.BatchAutoJoiner)1 BatchJoiner (io.cdap.cdap.etl.api.batch.BatchJoiner)1 BatchJoinerContext (io.cdap.cdap.etl.api.batch.BatchJoinerContext)1