Search in sources :

Example 1 with JoinError

use of io.cdap.cdap.etl.api.join.error.JoinError in project cdap by caskdata.

the class JoinDefinitionTest method testBadOutputSchema.

@Test
public void testBadOutputSchema() {
    JoinStage purchaseStage = JoinStage.builder("purchases", PURCHASE_SCHEMA).build();
    JoinStage userStage = JoinStage.builder("users", USER_SCHEMA).isOptional().build();

    // The declared output schema deliberately disagrees with the selected fields:
    //   - 'price' is selected but absent from the schema
    //   - 'pricee' is present in the schema but never selected
    //   - 'coupon' is declared non-nullable although it should be nullable
    //   - 'email' is declared with the wrong type
    Schema mismatchedSchema = Schema.recordOf(
        "joined",
        Schema.Field.of("ts", Schema.of(Schema.LogicalType.TIMESTAMP_MICROS)),
        Schema.Field.of("purchase_id", Schema.of(Schema.Type.LONG)),
        Schema.Field.of("user_id", Schema.nullableOf(Schema.of(Schema.Type.LONG))),
        Schema.Field.of("pricee", Schema.of(Schema.Type.DOUBLE)),
        Schema.Field.of("coupon", Schema.of(Schema.Type.BOOLEAN)),
        Schema.Field.of("name", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
        Schema.Field.of("email", Schema.nullableOf(Schema.of(Schema.Type.FLOAT))),
        Schema.Field.of("age", Schema.nullableOf(Schema.of(Schema.Type.INT))),
        Schema.Field.of("bday", Schema.nullableOf(Schema.of(Schema.LogicalType.DATE))));

    try {
        JoinKey purchaseKey = new JoinKey("purchases", Collections.singletonList("user_id"));
        JoinKey userKey = new JoinKey("users", Collections.singletonList("id"));
        JoinDefinition.builder()
            .select(
                new JoinField("purchases", "id", "purchase_id"),
                new JoinField("users", "id", "user_id"),
                new JoinField("purchases", "ts"),
                new JoinField("purchases", "price"),
                new JoinField("purchases", "coupon"),
                new JoinField("users", "name"),
                new JoinField("users", "email"),
                new JoinField("users", "age"),
                new JoinField("users", "bday"))
            .from(purchaseStage, userStage)
            .on(JoinCondition.onKeys().addKey(purchaseKey).addKey(userKey).build())
            .setOutputSchema(mismatchedSchema)
            .build();
        Assert.fail("Invalid output schema did not fail as expected");
    } catch (InvalidJoinException e) {
        // Building the definition is expected to be rejected; inspect what was reported.
        Collection<JoinError> reportedErrors = e.getErrors();
        Assert.assertEquals(4, reportedErrors.size());

        // Output field name -> expected type as reported by the schema errors.
        Map<String, String> expectedTypeByField = new HashMap<>();
        expectedTypeByField.put("pricee", null);
        expectedTypeByField.put("coupon", "boolean");
        expectedTypeByField.put("email", "string");

        Map<String, String> reportedTypeByField = new HashMap<>();
        for (JoinError reportedError : reportedErrors) {
            if (reportedError.getType() == JoinError.Type.OUTPUT_SCHEMA) {
                OutputSchemaError schemaError = (OutputSchemaError) reportedError;
                reportedTypeByField.put(schemaError.getField(), schemaError.getExpectedType());
            } else {
                // The only non-schema error is the one about a selected field
                // missing from the output schema.
                Assert.assertEquals(JoinError.Type.GENERAL, reportedError.getType());
            }
        }
        Assert.assertEquals(expectedTypeByField, reportedTypeByField);
    }
}
Also used : JoinError(io.cdap.cdap.etl.api.join.error.JoinError) Schema(io.cdap.cdap.api.data.schema.Schema) Collection(java.util.Collection) OutputSchemaError(io.cdap.cdap.etl.api.join.error.OutputSchemaError) HashMap(java.util.HashMap) Map(java.util.Map) Test(org.junit.Test)

Example 2 with JoinError

use of io.cdap.cdap.etl.api.join.error.JoinError in project cdap by caskdata.

the class JoinDistribution method validate.

/**
 * Validates this distribution configuration against the stages being joined.
 *
 * <p>Every failed check contributes an error, so the caller receives all problems at once
 * instead of only the first one.
 *
 * @param stages the stages taking part in the join
 * @return the errors found; empty when every check passes
 */
public Collection<JoinError> validate(List<JoinStage> stages) {
    List<JoinError> errors = new ArrayList<>();
    if (stages.size() > 2) {
        errors.add(new JoinError("Only two stages can be joined if a distribution factor is specified"));
    }
    if (skewedStageName == null) {
        errors.add(new DistributionStageError("Distribution requires skewed stage name to be defined"));
    }
    if (distributionFactor < 1) {
        errors.add(new DistributionSizeError("Distribution size must be greater than 0"));
    }
    // Only look the skewed stage up when a name was given. A missing name has already been
    // reported above, and looking it up anyway would add a second, misleading
    // "Skewed stage 'null' does not match ..." error for the same root cause.
    if (skewedStageName != null) {
        JoinStage skewedStage = stages.stream()
            .filter(s -> s.getStageName().equals(skewedStageName))
            .findFirst()
            .orElse(null);
        if (skewedStage == null) {
            errors.add(new DistributionStageError(String.format(
                "Skewed stage '%s' does not match any of the specified stages", skewedStageName)));
        } else if (!skewedStage.isRequired()) {
            errors.add(new DistributionStageError(String.format(
                "Distribution only supports inner or left outer joins, the skewed stage '%s' must be required",
                skewedStageName)));
        }
    }
    if (stages.stream().anyMatch(JoinStage::isBroadcast)) {
        errors.add(new BroadcastError("Distribution cannot be used if either stage will be broadcast"));
    }
    return errors;
}
Also used : JoinError(io.cdap.cdap.etl.api.join.error.JoinError) ArrayList(java.util.ArrayList) BroadcastError(io.cdap.cdap.etl.api.join.error.BroadcastError) DistributionStageError(io.cdap.cdap.etl.api.join.error.DistributionStageError) DistributionSizeError(io.cdap.cdap.etl.api.join.error.DistributionSizeError)

Example 3 with JoinError

use of io.cdap.cdap.etl.api.join.error.JoinError in project cdap by cdapio.

the class JoinDistribution method validate.

/**
 * Validates this distribution configuration against the stages being joined.
 *
 * <p>Every failed check contributes an error, so the caller receives all problems at once
 * instead of only the first one.
 *
 * @param stages the stages taking part in the join
 * @return the errors found; empty when every check passes
 */
public Collection<JoinError> validate(List<JoinStage> stages) {
    List<JoinError> errors = new ArrayList<>();
    if (stages.size() > 2) {
        errors.add(new JoinError("Only two stages can be joined if a distribution factor is specified"));
    }
    if (skewedStageName == null) {
        errors.add(new DistributionStageError("Distribution requires skewed stage name to be defined"));
    }
    if (distributionFactor < 1) {
        errors.add(new DistributionSizeError("Distribution size must be greater than 0"));
    }
    // Only look the skewed stage up when a name was given. A missing name has already been
    // reported above, and looking it up anyway would add a second, misleading
    // "Skewed stage 'null' does not match ..." error for the same root cause.
    if (skewedStageName != null) {
        JoinStage skewedStage = stages.stream()
            .filter(s -> s.getStageName().equals(skewedStageName))
            .findFirst()
            .orElse(null);
        if (skewedStage == null) {
            errors.add(new DistributionStageError(String.format(
                "Skewed stage '%s' does not match any of the specified stages", skewedStageName)));
        } else if (!skewedStage.isRequired()) {
            errors.add(new DistributionStageError(String.format(
                "Distribution only supports inner or left outer joins, the skewed stage '%s' must be required",
                skewedStageName)));
        }
    }
    if (stages.stream().anyMatch(JoinStage::isBroadcast)) {
        errors.add(new BroadcastError("Distribution cannot be used if either stage will be broadcast"));
    }
    return errors;
}
Also used : JoinError(io.cdap.cdap.etl.api.join.error.JoinError) ArrayList(java.util.ArrayList) BroadcastError(io.cdap.cdap.etl.api.join.error.BroadcastError) DistributionStageError(io.cdap.cdap.etl.api.join.error.DistributionStageError) DistributionSizeError(io.cdap.cdap.etl.api.join.error.DistributionSizeError)

Example 4 with JoinError

use of io.cdap.cdap.etl.api.join.error.JoinError in project cdap by caskdata.

the class InvalidJoinException method getMessage.

/**
 * Builds the exception message from the first error in the given collection.
 *
 * <p>The result is the error's message, terminated with a period if it does not already end
 * with one, followed by the error's corrective action. When the error carries no corrective
 * action, the message is returned on its own so the text never ends in a literal "null".
 *
 * @param errors the errors describing why the join is invalid
 * @return the message derived from the first error
 * @throws IllegalStateException if {@code errors} is empty
 */
private static String getMessage(Collection<JoinError> errors) {
    if (errors.isEmpty()) {
        throw new IllegalStateException("An invalid join must contain at least one error, " + "or it must provide an error message.");
    }
    JoinError error = errors.iterator().next();
    String message = error.getMessage();
    String correctiveAction = error.getCorrectiveAction();
    // NOTE(review): JoinError can be created with only a message, so the corrective action is
    // presumably absent in that case; formatting it with %s would append the text "null".
    if (correctiveAction == null || correctiveAction.isEmpty()) {
        return message;
    }
    return String.format("%s%s %s", message, message.endsWith(".") ? "" : ".", correctiveAction);
}
Also used : JoinError(io.cdap.cdap.etl.api.join.error.JoinError)

Example 5 with JoinError

use of io.cdap.cdap.etl.api.join.error.JoinError in project cdap by cdapio.

the class JoinDefinitionTest method testBadOutputSchema.

@Test
public void testBadOutputSchema() {
    JoinStage purchaseStage = JoinStage.builder("purchases", PURCHASE_SCHEMA).build();
    JoinStage userStage = JoinStage.builder("users", USER_SCHEMA).isOptional().build();

    // The declared output schema deliberately disagrees with the selected fields:
    //   - 'price' is selected but absent from the schema
    //   - 'pricee' is present in the schema but never selected
    //   - 'coupon' is declared non-nullable although it should be nullable
    //   - 'email' is declared with the wrong type
    Schema mismatchedSchema = Schema.recordOf(
        "joined",
        Schema.Field.of("ts", Schema.of(Schema.LogicalType.TIMESTAMP_MICROS)),
        Schema.Field.of("purchase_id", Schema.of(Schema.Type.LONG)),
        Schema.Field.of("user_id", Schema.nullableOf(Schema.of(Schema.Type.LONG))),
        Schema.Field.of("pricee", Schema.of(Schema.Type.DOUBLE)),
        Schema.Field.of("coupon", Schema.of(Schema.Type.BOOLEAN)),
        Schema.Field.of("name", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
        Schema.Field.of("email", Schema.nullableOf(Schema.of(Schema.Type.FLOAT))),
        Schema.Field.of("age", Schema.nullableOf(Schema.of(Schema.Type.INT))),
        Schema.Field.of("bday", Schema.nullableOf(Schema.of(Schema.LogicalType.DATE))));

    try {
        JoinKey purchaseKey = new JoinKey("purchases", Collections.singletonList("user_id"));
        JoinKey userKey = new JoinKey("users", Collections.singletonList("id"));
        JoinDefinition.builder()
            .select(
                new JoinField("purchases", "id", "purchase_id"),
                new JoinField("users", "id", "user_id"),
                new JoinField("purchases", "ts"),
                new JoinField("purchases", "price"),
                new JoinField("purchases", "coupon"),
                new JoinField("users", "name"),
                new JoinField("users", "email"),
                new JoinField("users", "age"),
                new JoinField("users", "bday"))
            .from(purchaseStage, userStage)
            .on(JoinCondition.onKeys().addKey(purchaseKey).addKey(userKey).build())
            .setOutputSchema(mismatchedSchema)
            .build();
        Assert.fail("Invalid output schema did not fail as expected");
    } catch (InvalidJoinException e) {
        // Building the definition is expected to be rejected; inspect what was reported.
        Collection<JoinError> reportedErrors = e.getErrors();
        Assert.assertEquals(4, reportedErrors.size());

        // Output field name -> expected type as reported by the schema errors.
        Map<String, String> expectedTypeByField = new HashMap<>();
        expectedTypeByField.put("pricee", null);
        expectedTypeByField.put("coupon", "boolean");
        expectedTypeByField.put("email", "string");

        Map<String, String> reportedTypeByField = new HashMap<>();
        for (JoinError reportedError : reportedErrors) {
            if (reportedError.getType() == JoinError.Type.OUTPUT_SCHEMA) {
                OutputSchemaError schemaError = (OutputSchemaError) reportedError;
                reportedTypeByField.put(schemaError.getField(), schemaError.getExpectedType());
            } else {
                // The only non-schema error is the one about a selected field
                // missing from the output schema.
                Assert.assertEquals(JoinError.Type.GENERAL, reportedError.getType());
            }
        }
        Assert.assertEquals(expectedTypeByField, reportedTypeByField);
    }
}
Also used : JoinError(io.cdap.cdap.etl.api.join.error.JoinError) Schema(io.cdap.cdap.api.data.schema.Schema) Collection(java.util.Collection) OutputSchemaError(io.cdap.cdap.etl.api.join.error.OutputSchemaError) HashMap(java.util.HashMap) Map(java.util.Map) Test(org.junit.Test)

Aggregations

JoinError (io.cdap.cdap.etl.api.join.error.JoinError)7 OutputSchemaError (io.cdap.cdap.etl.api.join.error.OutputSchemaError)3 ArrayList (java.util.ArrayList)3 Map (java.util.Map)3 Schema (io.cdap.cdap.api.data.schema.Schema)2 BroadcastError (io.cdap.cdap.etl.api.join.error.BroadcastError)2 DistributionSizeError (io.cdap.cdap.etl.api.join.error.DistributionSizeError)2 DistributionStageError (io.cdap.cdap.etl.api.join.error.DistributionStageError)2 Collection (java.util.Collection)2 HashMap (java.util.HashMap)2 Test (org.junit.Test)2 VisibleForTesting (com.google.common.annotations.VisibleForTesting)1 Strings (com.google.common.base.Strings)1 Description (io.cdap.cdap.api.annotation.Description)1 Name (io.cdap.cdap.api.annotation.Name)1 Plugin (io.cdap.cdap.api.annotation.Plugin)1 FailureCollector (io.cdap.cdap.etl.api.FailureCollector)1 BatchAutoJoiner (io.cdap.cdap.etl.api.batch.BatchAutoJoiner)1 BatchJoiner (io.cdap.cdap.etl.api.batch.BatchJoiner)1 BatchJoinerContext (io.cdap.cdap.etl.api.batch.BatchJoinerContext)1