Search in sources :

Example 21 with JoinStage

use of io.cdap.cdap.etl.api.join.JoinStage in project hydrator-plugins by cdapio.

the class JoinerConfigTest method testAdvancedOuterRequiresBroadcast.

/**
 * Expects {@code Joiner.define} to reject an advanced (expression) join with a
 * {@link ValidationException} carrying exactly one failure, whose single cause names the
 * {@code JoinerConfig.CONDITION_TYPE} stage config property.
 *
 * NOTE(review): three input stages are registered and the asserted property is CONDITION_TYPE,
 * which does not obviously match the "outer join requires broadcast" scenario described by the
 * method name and the fail message -- confirm which validation this test is meant to cover.
 */
@Test
public void testAdvancedOuterRequiresBroadcast() {
    JoinerConfig joinerConfig = new JoinerConfig(
        "users.id, emails.email",
        "users.id = emails.userid",
        Collections.singleton("users"));
    Joiner joiner = new Joiner(joinerConfig);
    FailureCollector failureCollector = new MockFailureCollector();

    Schema userSchema = Schema.recordOf(
        "user",
        Schema.Field.of("id", Schema.of(Schema.Type.INT)));
    Schema emailSchema = Schema.recordOf(
        "email",
        Schema.Field.of("email", Schema.of(Schema.Type.STRING)),
        Schema.Field.of("userid", Schema.of(Schema.Type.INT)));

    Map<String, JoinStage> stagesByName = new HashMap<>();
    stagesByName.put("users", JoinStage.builder("users", userSchema).build());
    stagesByName.put("emails", JoinStage.builder("emails", emailSchema).build());
    stagesByName.put("users2", JoinStage.builder("users2", userSchema).build());

    AutoJoinerContext context = new MockAutoJoinerContext(stagesByName, failureCollector);
    try {
        joiner.define(context);
        Assert.fail("Advanced left outer join without broadcast did not fail as expected.");
    } catch (ValidationException e) {
        // Exactly one failure with exactly one cause is expected.
        List<ValidationFailure> failures = e.getFailures();
        Assert.assertEquals(1, failures.size());
        ValidationFailure onlyFailure = failures.get(0);
        List<ValidationFailure.Cause> causes = onlyFailure.getCauses();
        Assert.assertEquals(1, causes.size());
        ValidationFailure.Cause onlyCause = causes.get(0);
        Assert.assertEquals(JoinerConfig.CONDITION_TYPE, onlyCause.getAttribute(CauseAttributes.STAGE_CONFIG));
    }
}
Also used : JoinStage(io.cdap.cdap.etl.api.join.JoinStage) ValidationException(io.cdap.cdap.etl.api.validation.ValidationException) HashMap(java.util.HashMap) Schema(io.cdap.cdap.api.data.schema.Schema) ValidationFailure(io.cdap.cdap.etl.api.validation.ValidationFailure) AutoJoinerContext(io.cdap.cdap.etl.api.join.AutoJoinerContext) MockFailureCollector(io.cdap.cdap.etl.mock.validation.MockFailureCollector) List(java.util.List) FailureCollector(io.cdap.cdap.etl.api.FailureCollector) Test(org.junit.Test)

Example 22 with JoinStage

use of io.cdap.cdap.etl.api.join.JoinStage in project hydrator-plugins by cdapio.

the class JoinerConfigTest method testAdvancedJoinCondition.

/**
 * Defines a join over the "users" and "emails" stages using an expression condition and checks
 * that the resulting definition reports an EXPRESSION condition whose text equals the configured
 * expression.
 */
@Test
public void testAdvancedJoinCondition() {
    JoinerConfig joinerConfig = new JoinerConfig(
        "users.id, emails.email",
        "users.id = emails.userid",
        new HashSet<>(Arrays.asList("users", "emails")));
    Joiner joiner = new Joiner(joinerConfig);
    FailureCollector failureCollector = new MockFailureCollector();

    Schema userSchema = Schema.recordOf(
        "user",
        Schema.Field.of("id", Schema.of(Schema.Type.INT)));
    Schema emailSchema = Schema.recordOf(
        "email",
        Schema.Field.of("email", Schema.of(Schema.Type.STRING)),
        Schema.Field.of("userid", Schema.of(Schema.Type.INT)));

    Map<String, JoinStage> stagesByName = new HashMap<>();
    stagesByName.put("users", JoinStage.builder("users", userSchema).build());
    stagesByName.put("emails", JoinStage.builder("emails", emailSchema).build());

    AutoJoinerContext context = new MockAutoJoinerContext(stagesByName, failureCollector);
    JoinDefinition definition = joiner.define(context);
    JoinCondition joinCondition = definition.getCondition();

    Assert.assertEquals(JoinCondition.Op.EXPRESSION, joinCondition.getOp());
    // The cast is only attempted after the op has been confirmed to be EXPRESSION.
    JoinCondition.OnExpression onExpression = (JoinCondition.OnExpression) joinCondition;
    Assert.assertEquals("users.id = emails.userid", onExpression.getExpression());
}
Also used : JoinStage(io.cdap.cdap.etl.api.join.JoinStage) HashMap(java.util.HashMap) Schema(io.cdap.cdap.api.data.schema.Schema) JoinCondition(io.cdap.cdap.etl.api.join.JoinCondition) AutoJoinerContext(io.cdap.cdap.etl.api.join.AutoJoinerContext) JoinDefinition(io.cdap.cdap.etl.api.join.JoinDefinition) MockFailureCollector(io.cdap.cdap.etl.mock.validation.MockFailureCollector) FailureCollector(io.cdap.cdap.etl.api.FailureCollector) Test(org.junit.Test)

Example 23 with JoinStage

use of io.cdap.cdap.etl.api.join.JoinStage in project hydrator-plugins by cdapio.

the class JoinerConfigTest method testOutputSchemaForInvalidKeys.

/**
 * Joins three stages on {@code film_id} where the filmCategory stage declares that key as LONG,
 * and expects {@code Joiner.define} to throw a {@link ValidationException} with two failures,
 * each having a single cause that points at the join keys property of the mock stage.
 */
@Test
public void testOutputSchemaForInvalidKeys() {
    // film_id is LONG in this schema; per the original author's note it should be STRING,
    // so defining the join is expected to fail validation.
    Schema filmCategorySchema = Schema.recordOf(
        "filmCategory",
        Schema.Field.of("film_id", Schema.of(Schema.Type.LONG)),
        Schema.Field.of("film_name", Schema.of(Schema.Type.STRING)),
        Schema.Field.of("category_name", Schema.of(Schema.Type.STRING)));

    String joinKeys = "film.film_id=filmActor.film_id=filmCategory.film_id";
    String selectedFields = "film.film_id, film.film_name, filmActor.actor_name as renamed_actor, "
        + "filmCategory.category_name as renamed_category";
    String requiredInputs = "film,filmActor,filmCategory";
    Joiner joiner = new Joiner(new JoinerConfig(joinKeys, selectedFields, requiredInputs));
    FailureCollector failureCollector = new MockFailureCollector();

    Map<String, JoinStage> stagesByName = new HashMap<>();
    stagesByName.put("film", JoinStage.builder("film", FILM_SCHEMA).build());
    stagesByName.put("filmActor", JoinStage.builder("filmActor", FILM_ACTOR_SCHEMA).build());
    stagesByName.put("filmCategory", JoinStage.builder("filmCategory", filmCategorySchema).build());

    AutoJoinerContext context = new MockAutoJoinerContext(stagesByName, failureCollector);
    try {
        joiner.define(context);
        Assert.fail();
    } catch (ValidationException e) {
        Assert.assertEquals(2, e.getFailures().size());

        // Both failures are expected to carry the same single cause.
        Cause expectedCause = new Cause();
        expectedCause.addAttribute(CauseAttributes.STAGE_CONFIG, JoinerConfig.JOIN_KEYS);
        expectedCause.addAttribute(STAGE, MOCK_STAGE);

        for (int failureIndex = 0; failureIndex < 2; failureIndex++) {
            Assert.assertEquals(1, e.getFailures().get(failureIndex).getCauses().size());
            Cause actualCause = e.getFailures().get(failureIndex).getCauses().get(0);
            Assert.assertEquals(expectedCause, actualCause);
        }
    }
}
Also used : JoinStage(io.cdap.cdap.etl.api.join.JoinStage) ValidationException(io.cdap.cdap.etl.api.validation.ValidationException) HashMap(java.util.HashMap) Schema(io.cdap.cdap.api.data.schema.Schema) AutoJoinerContext(io.cdap.cdap.etl.api.join.AutoJoinerContext) Cause(io.cdap.cdap.etl.api.validation.ValidationFailure.Cause) MockFailureCollector(io.cdap.cdap.etl.mock.validation.MockFailureCollector) FailureCollector(io.cdap.cdap.etl.api.FailureCollector) Test(org.junit.Test)

Example 24 with JoinStage

use of io.cdap.cdap.etl.api.join.JoinStage in project hydrator-plugins by cdapio.

the class Joiner method define.

/**
 * Builds the {@link JoinDefinition} for this joiner from the plugin configuration and the input
 * stages exposed by the given context.
 *
 * <p>Validation problems are recorded on the context's {@link FailureCollector}. Some are only
 * recorded (missing output schema, missing in-memory input for an advanced outer join, unknown
 * most-skewed input), while others abort immediately by throwing the collector's exception
 * (advanced condition with a number of inputs other than two, or an {@link InvalidJoinException}
 * raised while building the definition).
 *
 * @param context supplies the input stages and the failure collector
 * @return the join definition, or {@code null} when required properties contain macros
 */
@Nullable
@Override
public JoinDefinition define(AutoJoinerContext context) {
    FailureCollector collector = context.getFailureCollector();
    // An input schema is "unknown" when its stage carries no schema, which is the same test used
    // for useOutputSchema below. Checking only for a null stage object never detected that case.
    boolean hasUnknownInputSchema = context.getInputStages().values().stream()
        .anyMatch(stage -> stage == null || stage.getSchema() == null);
    if (hasUnknownInputSchema && !conf.containsMacro(JoinerConfig.OUTPUT_SCHEMA) && conf.getOutputSchema(collector) == null) {
        // If input schemas are unknown, an output schema must be provided.
        collector.addFailure("Output schema must be specified", null).withConfigProperty(JoinerConfig.OUTPUT_SCHEMA);
    }
    if (conf.requiredPropertiesContainMacros()) {
        return null;
    }
    Set<String> requiredStages = conf.getRequiredInputs();
    Set<String> broadcastStages = conf.getBroadcastInputs();
    List<JoinStage> inputs = new ArrayList<>(context.getInputStages().size());
    boolean useOutputSchema = false;
    // Copy each input stage, stamping it with the configured required/broadcast flags.
    for (JoinStage joinStage : context.getInputStages().values()) {
        inputs.add(JoinStage.builder(joinStage).setRequired(requiredStages.contains(joinStage.getStageName())).setBroadcast(broadcastStages.contains(joinStage.getStageName())).build());
        useOutputSchema = useOutputSchema || joinStage.getSchema() == null;
    }
    JoinCondition condition = conf.getCondition(collector);
    if (condition.getOp() == JoinCondition.Op.EXPRESSION) {
        if (inputs.size() != 2) {
            collector.addFailure("Advanced join conditions can only be used when there are two inputs.", null).withConfigProperty(JoinerConfig.CONDITION_TYPE);
            throw collector.getOrThrowException();
        }
        /*
         If this is an outer join of some kind and it is not a broadcast join, add a failure.
         this is because any outer join that is not an equality join in Spark will get turned into
         a BroadcastNestedLoopJoin anyway. So it is better to make that behavior explicit to the user
         and force them to specify which side should be broadcast. This also prevents problems where
         Spark will just choose to broadcast the right side because it doesn't know how big the input datasets are.
         See CDAP-17718 for more info.
       */
        if (requiredStages.size() < inputs.size() && broadcastStages.isEmpty()) {
            collector.addFailure("Advanced outer joins must specify an input to load in memory.", null).withConfigProperty(JoinerConfig.MEMORY_INPUTS);
        }
    }
    // Validate Join Left Side property
    if (!conf.mostSkewedInputContainsMacro() && !Strings.isNullOrEmpty(conf.getMostSkewedInput()) && inputs.stream().map(JoinStage::getStageName).noneMatch(sn -> Objects.equals(sn, conf.getMostSkewedInput()))) {
        collector.addFailure("Only one stage can be specified as the stage with the larger skew.", "Please select only one stage.").withConfigProperty(JoinerConfig.MOST_SKEWED_INPUT);
    }
    try {
        JoinDefinition.Builder joinBuilder = JoinDefinition.builder();
        // If a most skewed input is configured, reorder the inputs around it
        // (presumably so that stage is always first -- see reorderJoinStages).
        if (!conf.mostSkewedInputContainsMacro() && !Strings.isNullOrEmpty(conf.getMostSkewedInput())) {
            reorderJoinStages(inputs, conf.getMostSkewedInput());
        }
        joinBuilder.select(conf.getSelectedFields(collector)).from(inputs).on(condition);
        if (useOutputSchema) {
            joinBuilder.setOutputSchema(conf.getOutputSchema(collector));
        } else {
            joinBuilder.setOutputSchemaName("join.output");
        }
        if (conf.isDistributionValid(collector)) {
            joinBuilder.setDistributionFactor(conf.getDistributionFactor(), conf.getDistributionStageName());
        }
        return joinBuilder.build();
    } catch (InvalidJoinException e) {
        // An exception without structured errors is reported using its message alone.
        if (e.getErrors().isEmpty()) {
            collector.addFailure(e.getMessage(), null);
        }
        // Attribute each structured error to the config property (or output field) it concerns.
        for (JoinError error : e.getErrors()) {
            ValidationFailure failure = collector.addFailure(error.getMessage(), error.getCorrectiveAction());
            switch(error.getType()) {
                case JOIN_KEY:
                case JOIN_KEY_FIELD:
                    failure.withConfigProperty(JoinerConfig.JOIN_KEYS);
                    break;
                case SELECTED_FIELD:
                    JoinField badField = ((SelectedFieldError) error).getField();
                    failure.withConfigElement(JoinerConfig.SELECTED_FIELDS, String.format("%s.%s as %s", badField.getStageName(), badField.getFieldName(), badField.getAlias()));
                    break;
                case OUTPUT_SCHEMA:
                    OutputSchemaError schemaError = (OutputSchemaError) error;
                    failure.withOutputSchemaField(schemaError.getField());
                    break;
                case DISTRIBUTION_SIZE:
                    failure.withConfigProperty(JoinerConfig.DISTRIBUTION_FACTOR);
                    break;
                case DISTRIBUTION_STAGE:
                    failure.withConfigProperty(JoinerConfig.DISTRIBUTION_STAGE);
                    break;
                case BROADCAST:
                    failure.withConfigProperty(JoinerConfig.MEMORY_INPUTS);
                    break;
                case INVALID_CONDITION:
                    failure.withConfigProperty(JoinerConfig.CONDITION_EXPR);
                    break;
                default:
                    // Other error types are reported without a config property attribution.
                    break;
            }
        }
        throw collector.getOrThrowException();
    }
}
Also used : FieldOperation(io.cdap.cdap.etl.api.lineage.field.FieldOperation) Description(io.cdap.cdap.api.annotation.Description) BatchJoinerContext(io.cdap.cdap.etl.api.batch.BatchJoinerContext) LoggerFactory(org.slf4j.LoggerFactory) FailureCollector(io.cdap.cdap.etl.api.FailureCollector) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) Strings(com.google.common.base.Strings) Name(io.cdap.cdap.api.annotation.Name) Map(java.util.Map) InvalidJoinException(io.cdap.cdap.etl.api.join.InvalidJoinException) BatchAutoJoiner(io.cdap.cdap.etl.api.batch.BatchAutoJoiner) LinkedList(java.util.LinkedList) LinkedHashSet(java.util.LinkedHashSet) Nullable(javax.annotation.Nullable) BatchJoiner(io.cdap.cdap.etl.api.batch.BatchJoiner) JoinError(io.cdap.cdap.etl.api.join.error.JoinError) FieldTransformOperation(io.cdap.cdap.etl.api.lineage.field.FieldTransformOperation) Logger(org.slf4j.Logger) JoinField(io.cdap.cdap.etl.api.join.JoinField) JoinStage(io.cdap.cdap.etl.api.join.JoinStage) Set(java.util.Set) Plugin(io.cdap.cdap.api.annotation.Plugin) Collectors(java.util.stream.Collectors) SelectedFieldError(io.cdap.cdap.etl.api.join.error.SelectedFieldError) JoinKey(io.cdap.cdap.etl.api.join.JoinKey) Objects(java.util.Objects) List(java.util.List) ValidationFailure(io.cdap.cdap.etl.api.validation.ValidationFailure) OutputSchemaError(io.cdap.cdap.etl.api.join.error.OutputSchemaError) JoinDefinition(io.cdap.cdap.etl.api.join.JoinDefinition) AutoJoinerContext(io.cdap.cdap.etl.api.join.AutoJoinerContext) VisibleForTesting(com.google.common.annotations.VisibleForTesting) JoinCondition(io.cdap.cdap.etl.api.join.JoinCondition) Collections(java.util.Collections) JoinStage(io.cdap.cdap.etl.api.join.JoinStage) ArrayList(java.util.ArrayList) InvalidJoinException(io.cdap.cdap.etl.api.join.InvalidJoinException) JoinField(io.cdap.cdap.etl.api.join.JoinField) OutputSchemaError(io.cdap.cdap.etl.api.join.error.OutputSchemaError) JoinCondition(io.cdap.cdap.etl.api.join.JoinCondition) 
ValidationFailure(io.cdap.cdap.etl.api.validation.ValidationFailure) JoinError(io.cdap.cdap.etl.api.join.error.JoinError) JoinDefinition(io.cdap.cdap.etl.api.join.JoinDefinition) Objects(java.util.Objects) FailureCollector(io.cdap.cdap.etl.api.FailureCollector) Nullable(javax.annotation.Nullable)

Aggregations

JoinStage (io.cdap.cdap.etl.api.join.JoinStage)24 JoinDefinition (io.cdap.cdap.etl.api.join.JoinDefinition)14 HashMap (java.util.HashMap)13 Test (org.junit.Test)13 Schema (io.cdap.cdap.api.data.schema.Schema)11 JoinCondition (io.cdap.cdap.etl.api.join.JoinCondition)10 AutoJoinerContext (io.cdap.cdap.etl.api.join.AutoJoinerContext)8 SparkCollection (io.cdap.cdap.etl.spark.SparkCollection)8 Matchers.anyString (org.mockito.Matchers.anyString)8 FailureCollector (io.cdap.cdap.etl.api.FailureCollector)6 JoinField (io.cdap.cdap.etl.api.join.JoinField)5 MockFailureCollector (io.cdap.cdap.etl.mock.validation.MockFailureCollector)5 ArrayList (java.util.ArrayList)5 HashSet (java.util.HashSet)5 List (java.util.List)5 ValidationException (io.cdap.cdap.etl.api.validation.ValidationException)4 JoinCollection (io.cdap.cdap.etl.spark.join.JoinCollection)4 BatchAutoJoiner (io.cdap.cdap.etl.api.batch.BatchAutoJoiner)3 BatchJoiner (io.cdap.cdap.etl.api.batch.BatchJoiner)3 JoinKey (io.cdap.cdap.etl.api.join.JoinKey)3