Search in sources :

Example 1 with OutputSchemaError

use of io.cdap.cdap.etl.api.join.error.OutputSchemaError in project cdap by caskdata.

the class JoinDefinitionTest method testBadOutputSchema.

/**
 * Verifies that building a join definition with an output schema that does not match the
 * selected fields fails with an {@link InvalidJoinException} carrying one error per problem.
 */
@Test
public void testBadOutputSchema() {
    JoinStage purchases = JoinStage.builder("purchases", PURCHASE_SCHEMA).build();
    JoinStage users = JoinStage.builder("users", USER_SCHEMA).isOptional().build();

    /*
         The schema below is deliberately wrong. Per the original author's notes:

         the 'price' field is missing
         there is an extra 'pricee' field
         'coupon' should be nullable
         'email' has the wrong type
     */
    Schema badSchema = Schema.recordOf(
        "joined",
        Schema.Field.of("ts", Schema.of(Schema.LogicalType.TIMESTAMP_MICROS)),
        Schema.Field.of("purchase_id", Schema.of(Schema.Type.LONG)),
        Schema.Field.of("user_id", Schema.nullableOf(Schema.of(Schema.Type.LONG))),
        Schema.Field.of("pricee", Schema.of(Schema.Type.DOUBLE)),
        Schema.Field.of("coupon", Schema.of(Schema.Type.BOOLEAN)),
        Schema.Field.of("name", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
        Schema.Field.of("email", Schema.nullableOf(Schema.of(Schema.Type.FLOAT))),
        Schema.Field.of("age", Schema.nullableOf(Schema.of(Schema.Type.INT))),
        Schema.Field.of("bday", Schema.nullableOf(Schema.of(Schema.LogicalType.DATE))));

    // Attempt the build and remember the failure, if any, so it can be inspected afterwards.
    InvalidJoinException failure = null;
    try {
        JoinDefinition.builder()
            .select(
                new JoinField("purchases", "id", "purchase_id"),
                new JoinField("users", "id", "user_id"),
                new JoinField("purchases", "ts"),
                new JoinField("purchases", "price"),
                new JoinField("purchases", "coupon"),
                new JoinField("users", "name"),
                new JoinField("users", "email"),
                new JoinField("users", "age"),
                new JoinField("users", "bday"))
            .from(purchases, users)
            .on(JoinCondition.onKeys()
                .addKey(new JoinKey("purchases", Collections.singletonList("user_id")))
                .addKey(new JoinKey("users", Collections.singletonList("id")))
                .build())
            .setOutputSchema(badSchema)
            .build();
    } catch (InvalidJoinException e) {
        failure = e;
    }
    if (failure == null) {
        Assert.fail("Invalid output schema did not fail as expected");
    }

    Collection<JoinError> errors = failure.getErrors();
    Assert.assertEquals(4, errors.size());

    // Output schema field name -> expected type reported by the error.
    Map<String, String> expected = new HashMap<>();
    expected.put("email", "string");
    expected.put("coupon", "boolean");
    expected.put("pricee", null);

    Map<String, String> reported = new HashMap<>();
    for (JoinError error : errors) {
        if (error.getType() == JoinError.Type.OUTPUT_SCHEMA) {
            OutputSchemaError schemaError = (OutputSchemaError) error;
            reported.put(schemaError.getField(), schemaError.getExpectedType());
        } else {
            // this is the error about one of the selected fields missing from the output schema
            Assert.assertEquals(JoinError.Type.GENERAL, error.getType());
        }
    }
    Assert.assertEquals(expected, reported);
}
Also used : JoinError(io.cdap.cdap.etl.api.join.error.JoinError) Schema(io.cdap.cdap.api.data.schema.Schema) Collection(java.util.Collection) OutputSchemaError(io.cdap.cdap.etl.api.join.error.OutputSchemaError) HashMap(java.util.HashMap) Map(java.util.Map) Test(org.junit.Test)

Example 2 with OutputSchemaError

use of io.cdap.cdap.etl.api.join.error.OutputSchemaError in project cdap by cdapio.

the class JoinDefinitionTest method testBadOutputSchema.

/**
 * Builds a join definition whose explicit output schema disagrees with the selected fields and
 * checks that the resulting {@link InvalidJoinException} reports each mismatch.
 */
@Test
public void testBadOutputSchema() {
    // Field name -> expected type that each OUTPUT_SCHEMA error should report.
    Map<String, String> expectedBadFields = new HashMap<>();
    expectedBadFields.put("coupon", "boolean");
    expectedBadFields.put("email", "string");
    expectedBadFields.put("pricee", null);

    JoinStage purchases = JoinStage.builder("purchases", PURCHASE_SCHEMA).build();
    JoinStage users = JoinStage.builder("users", USER_SCHEMA).isOptional().build();

    /*
         Problems intentionally built into this schema (as noted by the original author):

         no 'price' field
         an unexpected 'pricee' field
         'coupon' should be nullable
         'email' is declared with the wrong type
     */
    Schema badSchema = Schema.recordOf("joined",
        Schema.Field.of("ts", Schema.of(Schema.LogicalType.TIMESTAMP_MICROS)),
        Schema.Field.of("purchase_id", Schema.of(Schema.Type.LONG)),
        Schema.Field.of("user_id", Schema.nullableOf(Schema.of(Schema.Type.LONG))),
        Schema.Field.of("pricee", Schema.of(Schema.Type.DOUBLE)),
        Schema.Field.of("coupon", Schema.of(Schema.Type.BOOLEAN)),
        Schema.Field.of("name", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
        Schema.Field.of("email", Schema.nullableOf(Schema.of(Schema.Type.FLOAT))),
        Schema.Field.of("age", Schema.nullableOf(Schema.of(Schema.Type.INT))),
        Schema.Field.of("bday", Schema.nullableOf(Schema.of(Schema.LogicalType.DATE))));

    try {
        JoinDefinition.builder()
            .select(new JoinField("purchases", "id", "purchase_id"),
                    new JoinField("users", "id", "user_id"),
                    new JoinField("purchases", "ts"),
                    new JoinField("purchases", "price"),
                    new JoinField("purchases", "coupon"),
                    new JoinField("users", "name"),
                    new JoinField("users", "email"),
                    new JoinField("users", "age"),
                    new JoinField("users", "bday"))
            .from(purchases, users)
            .on(JoinCondition.onKeys()
                    .addKey(new JoinKey("purchases", Collections.singletonList("user_id")))
                    .addKey(new JoinKey("users", Collections.singletonList("id")))
                    .build())
            .setOutputSchema(badSchema)
            .build();
        Assert.fail("Invalid output schema did not fail as expected");
    } catch (InvalidJoinException e) {
        // expected
        Collection<JoinError> joinErrors = e.getErrors();
        Assert.assertEquals(4, joinErrors.size());

        Map<String, String> actualBadFields = new HashMap<>();
        for (JoinError current : joinErrors) {
            boolean isSchemaError = current.getType() == JoinError.Type.OUTPUT_SCHEMA;
            if (!isSchemaError) {
                // this is the error about one of the selected fields missing from the output schema
                Assert.assertEquals(JoinError.Type.GENERAL, current.getType());
            } else {
                OutputSchemaError asSchemaError = (OutputSchemaError) current;
                actualBadFields.put(asSchemaError.getField(), asSchemaError.getExpectedType());
            }
        }
        Assert.assertEquals(expectedBadFields, actualBadFields);
    }
}
Also used : JoinError(io.cdap.cdap.etl.api.join.error.JoinError) Schema(io.cdap.cdap.api.data.schema.Schema) Collection(java.util.Collection) OutputSchemaError(io.cdap.cdap.etl.api.join.error.OutputSchemaError) HashMap(java.util.HashMap) Map(java.util.Map) Test(org.junit.Test)

Example 3 with OutputSchemaError

use of io.cdap.cdap.etl.api.join.error.OutputSchemaError in project hydrator-plugins by cdapio.

the class Joiner method define.

/**
 * Builds the {@link JoinDefinition} for this joiner from the plugin configuration and the
 * input stages supplied by the context.
 *
 * <p>Configuration problems are recorded on the context's {@link FailureCollector}. If the
 * join definition cannot be built, each {@link JoinError} is translated into a validation
 * failure attached to the matching config property, and the collector's exception is thrown.
 *
 * @param context provides the input stages and the failure collector
 * @return the join definition, or {@code null} when required properties contain macros and the
 *     definition therefore cannot be built yet
 */
@Nullable
@Override
public JoinDefinition define(AutoJoinerContext context) {
    FailureCollector collector = context.getFailureCollector();
    // An input schema is unknown when the stage's schema is null. Checking only whether the
    // JoinStage itself is null would never detect that, because every stage is dereferenced
    // unconditionally in the loop below.
    boolean hasUnknownInputSchema = context.getInputStages().values().stream()
        .anyMatch(stage -> stage == null || stage.getSchema() == null);
    if (hasUnknownInputSchema && !conf.containsMacro(JoinerConfig.OUTPUT_SCHEMA)
        && conf.getOutputSchema(collector) == null) {
        // If input schemas are unknown, an output schema must be provided.
        collector.addFailure("Output schema must be specified", null).withConfigProperty(JoinerConfig.OUTPUT_SCHEMA);
    }
    if (conf.requiredPropertiesContainMacros()) {
        return null;
    }
    Set<String> requiredStages = conf.getRequiredInputs();
    Set<String> broadcastStages = conf.getBroadcastInputs();
    List<JoinStage> inputs = new ArrayList<>(context.getInputStages().size());
    for (JoinStage joinStage : context.getInputStages().values()) {
        String stageName = joinStage.getStageName();
        inputs.add(JoinStage.builder(joinStage)
            .setRequired(requiredStages.contains(stageName))
            .setBroadcast(broadcastStages.contains(stageName))
            .build());
    }
    JoinCondition condition = conf.getCondition(collector);
    if (condition.getOp() == JoinCondition.Op.EXPRESSION) {
        if (inputs.size() != 2) {
            collector.addFailure("Advanced join conditions can only be used when there are two inputs.", null).withConfigProperty(JoinerConfig.CONDITION_TYPE);
            throw collector.getOrThrowException();
        }
        /*
         If this is an outer join of some kind and it is not a broadcast join, add a failure.
         this is because any outer join that is not an equality join in Spark will get turned into
         a BroadcastNestedLoopJoin anyway. So it is better to make that behavior explicit to the user
         and force them to specify which side should be broadcast. This also prevents problems where
         Spark will just choose to broadcast the right side because it doesn't know how big the input datasets are.
         See CDAP-17718 for more info.
       */
        if (requiredStages.size() < inputs.size() && broadcastStages.isEmpty()) {
            collector.addFailure("Advanced outer joins must specify an input to load in memory.", null).withConfigProperty(JoinerConfig.MEMORY_INPUTS);
        }
    }
    // True when a most-skewed input is configured as a concrete (non-macro, non-empty) value.
    // The short-circuit keeps getMostSkewedInput() from being called when the property is a macro.
    boolean hasMostSkewedInput = !conf.mostSkewedInputContainsMacro()
        && !Strings.isNullOrEmpty(conf.getMostSkewedInput());
    // Validate Join Left Side property: the configured value must name exactly one input stage.
    if (hasMostSkewedInput
        && inputs.stream().map(JoinStage::getStageName).noneMatch(sn -> Objects.equals(sn, conf.getMostSkewedInput()))) {
        collector.addFailure("Only one stage can be specified as the stage with the larger skew.", "Please select only one stage.").withConfigProperty(JoinerConfig.MOST_SKEWED_INPUT);
    }
    try {
        JoinDefinition.Builder joinBuilder = JoinDefinition.builder();
        // Reorder the inputs for the most skewed stage; presumably reorderJoinStages moves that
        // stage to the front so that it is always first (confirm against its implementation).
        if (hasMostSkewedInput) {
            reorderJoinStages(inputs, conf.getMostSkewedInput());
        }
        joinBuilder.select(conf.getSelectedFields(collector)).from(inputs).on(condition);
        if (hasUnknownInputSchema) {
            // At least one input schema is unknown, so the configured output schema is used.
            joinBuilder.setOutputSchema(conf.getOutputSchema(collector));
        } else {
            joinBuilder.setOutputSchemaName("join.output");
        }
        if (conf.isDistributionValid(collector)) {
            joinBuilder.setDistributionFactor(conf.getDistributionFactor(), conf.getDistributionStageName());
        }
        return joinBuilder.build();
    } catch (InvalidJoinException e) {
        if (e.getErrors().isEmpty()) {
            // No structured errors available; report the exception message on its own.
            collector.addFailure(e.getMessage(), null);
        }
        // Attach each join error to the config property (or schema field) it relates to.
        for (JoinError error : e.getErrors()) {
            ValidationFailure failure = collector.addFailure(error.getMessage(), error.getCorrectiveAction());
            switch (error.getType()) {
                case JOIN_KEY:
                case JOIN_KEY_FIELD:
                    failure.withConfigProperty(JoinerConfig.JOIN_KEYS);
                    break;
                case SELECTED_FIELD:
                    JoinField badField = ((SelectedFieldError) error).getField();
                    failure.withConfigElement(JoinerConfig.SELECTED_FIELDS, String.format("%s.%s as %s", badField.getStageName(), badField.getFieldName(), badField.getAlias()));
                    break;
                case OUTPUT_SCHEMA:
                    OutputSchemaError schemaError = (OutputSchemaError) error;
                    failure.withOutputSchemaField(schemaError.getField());
                    break;
                case DISTRIBUTION_SIZE:
                    failure.withConfigProperty(JoinerConfig.DISTRIBUTION_FACTOR);
                    break;
                case DISTRIBUTION_STAGE:
                    failure.withConfigProperty(JoinerConfig.DISTRIBUTION_STAGE);
                    break;
                case BROADCAST:
                    failure.withConfigProperty(JoinerConfig.MEMORY_INPUTS);
                    break;
                case INVALID_CONDITION:
                    failure.withConfigProperty(JoinerConfig.CONDITION_EXPR);
                    break;
                default:
                    // Other error types are reported without a specific config property.
                    break;
            }
        }
        throw collector.getOrThrowException();
    }
}
Also used : FieldOperation(io.cdap.cdap.etl.api.lineage.field.FieldOperation) Description(io.cdap.cdap.api.annotation.Description) BatchJoinerContext(io.cdap.cdap.etl.api.batch.BatchJoinerContext) LoggerFactory(org.slf4j.LoggerFactory) FailureCollector(io.cdap.cdap.etl.api.FailureCollector) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) Strings(com.google.common.base.Strings) Name(io.cdap.cdap.api.annotation.Name) Map(java.util.Map) InvalidJoinException(io.cdap.cdap.etl.api.join.InvalidJoinException) BatchAutoJoiner(io.cdap.cdap.etl.api.batch.BatchAutoJoiner) LinkedList(java.util.LinkedList) LinkedHashSet(java.util.LinkedHashSet) Nullable(javax.annotation.Nullable) BatchJoiner(io.cdap.cdap.etl.api.batch.BatchJoiner) JoinError(io.cdap.cdap.etl.api.join.error.JoinError) FieldTransformOperation(io.cdap.cdap.etl.api.lineage.field.FieldTransformOperation) Logger(org.slf4j.Logger) JoinField(io.cdap.cdap.etl.api.join.JoinField) JoinStage(io.cdap.cdap.etl.api.join.JoinStage) Set(java.util.Set) Plugin(io.cdap.cdap.api.annotation.Plugin) Collectors(java.util.stream.Collectors) SelectedFieldError(io.cdap.cdap.etl.api.join.error.SelectedFieldError) JoinKey(io.cdap.cdap.etl.api.join.JoinKey) Objects(java.util.Objects) List(java.util.List) ValidationFailure(io.cdap.cdap.etl.api.validation.ValidationFailure) OutputSchemaError(io.cdap.cdap.etl.api.join.error.OutputSchemaError) JoinDefinition(io.cdap.cdap.etl.api.join.JoinDefinition) AutoJoinerContext(io.cdap.cdap.etl.api.join.AutoJoinerContext) VisibleForTesting(com.google.common.annotations.VisibleForTesting) JoinCondition(io.cdap.cdap.etl.api.join.JoinCondition) Collections(java.util.Collections) JoinStage(io.cdap.cdap.etl.api.join.JoinStage) ArrayList(java.util.ArrayList) InvalidJoinException(io.cdap.cdap.etl.api.join.InvalidJoinException) JoinField(io.cdap.cdap.etl.api.join.JoinField) OutputSchemaError(io.cdap.cdap.etl.api.join.error.OutputSchemaError) JoinCondition(io.cdap.cdap.etl.api.join.JoinCondition) 
ValidationFailure(io.cdap.cdap.etl.api.validation.ValidationFailure) JoinError(io.cdap.cdap.etl.api.join.error.JoinError) JoinDefinition(io.cdap.cdap.etl.api.join.JoinDefinition) Objects(java.util.Objects) FailureCollector(io.cdap.cdap.etl.api.FailureCollector) Nullable(javax.annotation.Nullable)

Aggregations

JoinError (io.cdap.cdap.etl.api.join.error.JoinError)3 OutputSchemaError (io.cdap.cdap.etl.api.join.error.OutputSchemaError)3 Map (java.util.Map)3 Schema (io.cdap.cdap.api.data.schema.Schema)2 Collection (java.util.Collection)2 HashMap (java.util.HashMap)2 Test (org.junit.Test)2 VisibleForTesting (com.google.common.annotations.VisibleForTesting)1 Strings (com.google.common.base.Strings)1 Description (io.cdap.cdap.api.annotation.Description)1 Name (io.cdap.cdap.api.annotation.Name)1 Plugin (io.cdap.cdap.api.annotation.Plugin)1 FailureCollector (io.cdap.cdap.etl.api.FailureCollector)1 BatchAutoJoiner (io.cdap.cdap.etl.api.batch.BatchAutoJoiner)1 BatchJoiner (io.cdap.cdap.etl.api.batch.BatchJoiner)1 BatchJoinerContext (io.cdap.cdap.etl.api.batch.BatchJoinerContext)1 AutoJoinerContext (io.cdap.cdap.etl.api.join.AutoJoinerContext)1 InvalidJoinException (io.cdap.cdap.etl.api.join.InvalidJoinException)1 JoinCondition (io.cdap.cdap.etl.api.join.JoinCondition)1 JoinDefinition (io.cdap.cdap.etl.api.join.JoinDefinition)1