Example 11 with ETLPlugin

Use of io.cdap.cdap.etl.proto.v2.ETLPlugin in project cdap by caskdata.

From class ValidationUtils, method validate().

/**
 * Validates a plugin based on the {@link StageValidationRequest}.
 *
 * @param namespace            namespace in which the validation is performed
 * @param validationRequest    {@link StageValidationRequest} with the plugin properties
 * @param pluginConfigurer     {@link PluginConfigurer} used to instantiate the plugin
 * @param macroFn              {@link Function} for evaluating macros in the plugin properties
 * @param featureFlagsProvider {@link FeatureFlagsProvider} for checking feature flags
 * @return {@link StageValidationResponse} carrying the stage spec on success, or the validation failures
 */
public static StageValidationResponse validate(String namespace, StageValidationRequest validationRequest,
                                               PluginConfigurer pluginConfigurer,
                                               Function<Map<String, String>, Map<String, String>> macroFn,
                                               FeatureFlagsProvider featureFlagsProvider) {
    ETLStage stageConfig = validationRequest.getStage();
    ValidatingConfigurer validatingConfigurer = new ValidatingConfigurer(pluginConfigurer, featureFlagsProvider);
    // Batch or Streaming doesn't matter for a single stage.
    PipelineSpecGenerator<ETLBatchConfig, BatchPipelineSpec> pipelineSpecGenerator =
        new BatchPipelineSpecGenerator(namespace, validatingConfigurer, null, Collections.emptySet(),
                                       Collections.emptySet(), Engine.SPARK, featureFlagsProvider);
    DefaultStageConfigurer stageConfigurer = new DefaultStageConfigurer(stageConfig.getName());
    for (StageSchema stageSchema : validationRequest.getInputSchemas()) {
        stageConfigurer.addInputSchema(stageSchema.getStage(), stageSchema.getSchema());
        stageConfigurer.addInputStage(stageSchema.getStage());
    }
    DefaultPipelineConfigurer pipelineConfigurer =
        new DefaultPipelineConfigurer(validatingConfigurer, stageConfig.getName(), Engine.SPARK,
                                      stageConfigurer, featureFlagsProvider);
    // evaluate macros
    Map<String, String> evaluatedProperties = macroFn.apply(stageConfig.getPlugin().getProperties());
    ETLPlugin originalConfig = stageConfig.getPlugin();
    ETLPlugin evaluatedConfig = new ETLPlugin(originalConfig.getName(), originalConfig.getType(),
                                              evaluatedProperties, originalConfig.getArtifactConfig());
    try {
        StageSpec spec = pipelineSpecGenerator.configureStage(stageConfig.getName(), evaluatedConfig, pipelineConfigurer).build();
        return new StageValidationResponse(spec);
    } catch (ValidationException e) {
        return new StageValidationResponse(e.getFailures());
    }
}
Also used: ValidationException (io.cdap.cdap.etl.api.validation.ValidationException), BatchPipelineSpecGenerator (io.cdap.cdap.etl.batch.BatchPipelineSpecGenerator), ETLPlugin (io.cdap.cdap.etl.proto.v2.ETLPlugin), DefaultStageConfigurer (io.cdap.cdap.etl.common.DefaultStageConfigurer), ETLBatchConfig (io.cdap.cdap.etl.proto.v2.ETLBatchConfig), BatchPipelineSpec (io.cdap.cdap.etl.batch.BatchPipelineSpec), StageSchema (io.cdap.cdap.etl.proto.v2.validation.StageSchema), ETLStage (io.cdap.cdap.etl.proto.v2.ETLStage), ValidatingConfigurer (io.cdap.cdap.etl.validation.ValidatingConfigurer), StageSpec (io.cdap.cdap.etl.proto.v2.spec.StageSpec), DefaultPipelineConfigurer (io.cdap.cdap.etl.common.DefaultPipelineConfigurer), StageValidationResponse (io.cdap.cdap.etl.proto.v2.validation.StageValidationResponse)
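
A minimal sketch of driving validate() from a caller, hedged: the package of ValidationUtils is not shown above, the ${inputPath} macro and its substitution are hypothetical stand-ins for a real macro evaluator, and the import paths for PluginConfigurer and FeatureFlagsProvider are assumed from the CDAP API.

import io.cdap.cdap.api.feature.FeatureFlagsProvider;
import io.cdap.cdap.api.plugin.PluginConfigurer;
import io.cdap.cdap.etl.proto.v2.validation.StageValidationRequest;
import io.cdap.cdap.etl.proto.v2.validation.StageValidationResponse;
import java.util.HashMap;
import java.util.Map;

public class ValidateCallerSketch {
    static StageValidationResponse callValidate(StageValidationRequest request,
                                                PluginConfigurer pluginConfigurer,
                                                FeatureFlagsProvider featureFlags) {
        // Hypothetical macro evaluation: substitute a fixed value for ${inputPath}.
        return ValidationUtils.validate("default", request, pluginConfigurer,
            properties -> {
                Map<String, String> evaluated = new HashMap<>(properties);
                evaluated.replaceAll((k, v) -> v.replace("${inputPath}", "/tmp/input"));
                return evaluated;
            },
            featureFlags);
    }
}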

Example 12 with ETLPlugin

Use of io.cdap.cdap.etl.proto.v2.ETLPlugin in project cdap by caskdata.

From class AutoJoinerTest, method testInnerBetweenCondition().

@Test
public void testInnerBetweenCondition() throws Exception {
    /*
         users ----------|
                         |--> join --> sink
         age_groups -----|

         joinOn: users.age >= age_groups.lo and (users.age < age_groups.hi or age_groups.hi is null)
     */
    Schema userSchema = Schema.recordOf(
        "user",
        Schema.Field.of("name", Schema.of(Schema.Type.STRING)),
        Schema.Field.of("age", Schema.nullableOf(Schema.of(Schema.Type.INT))));
    Schema ageGroupSchema = Schema.recordOf(
        "age_group",
        Schema.Field.of("name", Schema.of(Schema.Type.STRING)),
        Schema.Field.of("lo", Schema.of(Schema.Type.INT)),
        Schema.Field.of("hi", Schema.nullableOf(Schema.of(Schema.Type.INT))));
    Schema expectedSchema = Schema.recordOf(
        "users.age_groups",
        Schema.Field.of("username", Schema.of(Schema.Type.STRING)),
        Schema.Field.of("age_group", Schema.of(Schema.Type.STRING)));
    String userInput = UUID.randomUUID().toString();
    String agesInput = UUID.randomUUID().toString();
    String output = UUID.randomUUID().toString();
    List<JoinField> select = new ArrayList<>();
    select.add(new JoinField("users", "name", "username"));
    select.add(new JoinField("age_groups", "name", "age_group"));
    JoinCondition.OnExpression condition = JoinCondition.onExpression()
        .setExpression("users.age >= age_groups.lo and (users.age < age_groups.hi or age_groups.hi is null)")
        .build();
    Map<String, String> joinerProperties = MockAutoJoiner.getProperties(
        Arrays.asList("users", "age_groups"), Collections.emptyList(),
        Arrays.asList("users", "age_groups"), Collections.emptyList(),
        select, false, null, condition);
    ETLBatchConfig config = ETLBatchConfig.builder()
        .addStage(new ETLStage("users", MockSource.getPlugin(userInput, userSchema)))
        .addStage(new ETLStage("age_groups", MockSource.getPlugin(agesInput, ageGroupSchema)))
        .addStage(new ETLStage("join", new ETLPlugin(MockAutoJoiner.NAME, BatchJoiner.PLUGIN_TYPE, joinerProperties)))
        .addStage(new ETLStage("sink", MockSink.getPlugin(output)))
        .addConnection("users", "join")
        .addConnection("age_groups", "join")
        .addConnection("join", "sink")
        .setEngine(Engine.SPARK)
        .build();
    AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, config);
    ApplicationId appId = NamespaceId.DEFAULT.app(UUID.randomUUID().toString());
    ApplicationManager appManager = deployApplication(appId, appRequest);
    List<StructuredRecord> records = new ArrayList<>();
    records.add(StructuredRecord.builder(userSchema).set("name", "Alice").set("age", 35).build());
    records.add(StructuredRecord.builder(userSchema).set("name", "Bob").build());
    records.add(StructuredRecord.builder(userSchema).set("name", "Carl").set("age", 13).build());
    records.add(StructuredRecord.builder(userSchema).set("name", "Dave").set("age", 0).build());
    records.add(StructuredRecord.builder(userSchema).set("name", "Elaine").set("age", 68).build());
    records.add(StructuredRecord.builder(userSchema).set("name", "Fred").set("age", 4).build());
    DataSetManager<Table> inputManager = getDataset(userInput);
    MockSource.writeInput(inputManager, records);
    records.clear();
    records.add(StructuredRecord.builder(ageGroupSchema).set("name", "infant").set("lo", 0).set("hi", 2).build());
    records.add(StructuredRecord.builder(ageGroupSchema).set("name", "toddler").set("lo", 2).set("hi", 5).build());
    records.add(StructuredRecord.builder(ageGroupSchema).set("name", "child").set("lo", 5).set("hi", 13).build());
    records.add(StructuredRecord.builder(ageGroupSchema).set("name", "teen").set("lo", 13).set("hi", 20).build());
    records.add(StructuredRecord.builder(ageGroupSchema).set("name", "adult").set("lo", 20).set("hi", 65).build());
    records.add(StructuredRecord.builder(ageGroupSchema).set("name", "senior").set("lo", 65).build());
    inputManager = getDataset(agesInput);
    MockSource.writeInput(inputManager, records);
    WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
    workflowManager.startAndWaitForGoodRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
    DataSetManager<Table> outputManager = getDataset(output);
    List<StructuredRecord> outputRecords = MockSink.readOutput(outputManager);
    Set<StructuredRecord> expected = new HashSet<>();
    expected.add(StructuredRecord.builder(expectedSchema).set("username", "Alice").set("age_group", "adult").build());
    expected.add(StructuredRecord.builder(expectedSchema).set("username", "Carl").set("age_group", "teen").build());
    expected.add(StructuredRecord.builder(expectedSchema).set("username", "Dave").set("age_group", "infant").build());
    expected.add(StructuredRecord.builder(expectedSchema).set("username", "Elaine").set("age_group", "senior").build());
    expected.add(StructuredRecord.builder(expectedSchema).set("username", "Fred").set("age_group", "toddler").build());
    Assert.assertEquals(expected, new HashSet<>(outputRecords));
    validateMetric(6, appId, "users.records.out");
    validateMetric(6, appId, "age_groups.records.out");
    validateMetric(12, appId, "join.records.in");
    validateMetric(expected.size(), appId, "join.records.out");
}
Also used: ApplicationManager (io.cdap.cdap.test.ApplicationManager), Table (io.cdap.cdap.api.dataset.table.Table), Schema (io.cdap.cdap.api.data.schema.Schema), WorkflowManager (io.cdap.cdap.test.WorkflowManager), ArrayList (java.util.ArrayList), JoinField (io.cdap.cdap.etl.api.join.JoinField), ETLPlugin (io.cdap.cdap.etl.proto.v2.ETLPlugin), StructuredRecord (io.cdap.cdap.api.data.format.StructuredRecord), JoinCondition (io.cdap.cdap.etl.api.join.JoinCondition), AppRequest (io.cdap.cdap.proto.artifact.AppRequest), ETLBatchConfig (io.cdap.cdap.etl.proto.v2.ETLBatchConfig), ETLStage (io.cdap.cdap.etl.proto.v2.ETLStage), ApplicationId (io.cdap.cdap.proto.id.ApplicationId), HashSet (java.util.HashSet), Test (org.junit.Test)
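
The join condition above is built with JoinCondition.onExpression(). A compact sketch of the same pattern, adding the optional dataset aliases that Example 13 below also uses (the alias names U and G are illustrative):

import io.cdap.cdap.etl.api.join.JoinCondition;

public class RangeJoinConditionSketch {
    static JoinCondition.OnExpression ageRangeCondition() {
        return JoinCondition.onExpression()
            // aliases are optional; they shorten the expression text
            .addDatasetAlias("users", "U")
            .addDatasetAlias("age_groups", "G")
            .setExpression("U.age >= G.lo and (U.age < G.hi or G.hi is null)")
            .build();
    }
}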

Example 13 with ETLPlugin

Use of io.cdap.cdap.etl.proto.v2.ETLPlugin in project cdap by caskdata.

From class AutoJoinerTest, method testLeftOuterComplexConditionBroadcast().

@Test
public void testLeftOuterComplexConditionBroadcast() throws Exception {
    /*
         sales ----------|
                         |--> join --> sink
         categories -----|

         joinOn:
           sales.price > 1000 and sales.date > '2020-01-01 00:00:00' and
           (sales.category = categories.id or (sales.category is null and sales.department = categories.department))
     */
    Schema salesSchema = Schema.recordOf(
        "sale",
        Schema.Field.of("id", Schema.of(Schema.Type.INT)),
        Schema.Field.of("price", Schema.of(Schema.Type.DOUBLE)),
        Schema.Field.of("date", Schema.of(Schema.LogicalType.DATETIME)),
        Schema.Field.of("category", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
        Schema.Field.of("department", Schema.nullableOf(Schema.of(Schema.Type.STRING))));
    Schema categorySchema = Schema.recordOf(
        "category",
        Schema.Field.of("id", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
        Schema.Field.of("department", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
        Schema.Field.of("flag", Schema.nullableOf(Schema.of(Schema.Type.BOOLEAN))));
    Schema expectedSchema = Schema.recordOf(
        "sales.categories",
        Schema.Field.of("id", Schema.of(Schema.Type.INT)),
        Schema.Field.of("flag", Schema.nullableOf(Schema.of(Schema.Type.BOOLEAN))));
    String salesInput = UUID.randomUUID().toString();
    String categoriesInput = UUID.randomUUID().toString();
    String output = UUID.randomUUID().toString();
    List<JoinField> select = new ArrayList<>();
    select.add(new JoinField("sales", "id"));
    select.add(new JoinField("categories", "flag"));
    /*
           Same condition as above, written with dataset aliases S (sales) and C (categories):
           S.price > 1000 and S.date > '2020-01-01 00:00:00' and
           (S.category = C.id or (S.category is null and S.department = C.department))
     */
    JoinCondition.OnExpression condition = JoinCondition.onExpression()
        .addDatasetAlias("sales", "S")
        .addDatasetAlias("categories", "C")
        .setExpression("S.price > 1000 and S.date > '2020-01-01 00:00:00' and " +
                       "(S.category = C.id or (S.category is null and S.department = C.department))")
        .build();
    Map<String, String> joinerProperties = MockAutoJoiner.getProperties(
        Arrays.asList("sales", "categories"), Collections.emptyList(),
        Collections.singletonList("sales"), Collections.singletonList("categories"),
        select, false, null, condition);
    ETLBatchConfig config = ETLBatchConfig.builder()
        .addStage(new ETLStage("sales", MockSource.getPlugin(salesInput, salesSchema)))
        .addStage(new ETLStage("categories", MockSource.getPlugin(categoriesInput, categorySchema)))
        .addStage(new ETLStage("join", new ETLPlugin(MockAutoJoiner.NAME, BatchJoiner.PLUGIN_TYPE, joinerProperties)))
        .addStage(new ETLStage("sink", MockSink.getPlugin(output)))
        .addConnection("sales", "join")
        .addConnection("categories", "join")
        .addConnection("join", "sink")
        .setEngine(Engine.SPARK)
        .build();
    AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, config);
    ApplicationId appId = NamespaceId.DEFAULT.app(UUID.randomUUID().toString());
    ApplicationManager appManager = deployApplication(appId, appRequest);
    List<StructuredRecord> records = new ArrayList<>();
    records.add(StructuredRecord.builder(salesSchema).set("id", 0).set("price", 123.45d).set("date", "2021-01-01 00:00:00").set("category", "electronics").set("department", "entertainment").build());
    records.add(StructuredRecord.builder(salesSchema).set("id", 1).set("price", 1000.01d).set("date", "2020-01-01 00:00:01").set("department", "home").build());
    records.add(StructuredRecord.builder(salesSchema).set("id", 2).set("price", 5000d).set("date", "2021-01-01 00:00:00").set("category", "furniture").build());
    records.add(StructuredRecord.builder(salesSchema).set("id", 3).set("price", 2000d).set("date", "2019-12-31 23:59:59").set("category", "furniture").build());
    records.add(StructuredRecord.builder(salesSchema).set("id", 4).set("price", 2000d).set("date", "2020-01-01 12:00:00").set("category", "tv").set("department", "entertainment").build());
    DataSetManager<Table> inputManager = getDataset(salesInput);
    MockSource.writeInput(inputManager, records);
    records.clear();
    records.add(StructuredRecord.builder(categorySchema).set("id", "electronics").set("department", "entertainment").set("flag", false).build());
    records.add(StructuredRecord.builder(categorySchema).set("id", "furniture").set("department", "home").set("flag", true).build());
    records.add(StructuredRecord.builder(categorySchema).set("id", "tv").set("department", "entertainment").set("flag", false).build());
    inputManager = getDataset(categoriesInput);
    MockSource.writeInput(inputManager, records);
    WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
    workflowManager.startAndWaitForGoodRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
    DataSetManager<Table> outputManager = getDataset(output);
    List<StructuredRecord> outputRecords = MockSink.readOutput(outputManager);
    Set<StructuredRecord> expected = new HashSet<>();
    expected.add(StructuredRecord.builder(expectedSchema).set("id", 0).build());
    expected.add(StructuredRecord.builder(expectedSchema).set("id", 1).set("flag", true).build());
    expected.add(StructuredRecord.builder(expectedSchema).set("id", 2).set("flag", true).build());
    expected.add(StructuredRecord.builder(expectedSchema).set("id", 3).build());
    expected.add(StructuredRecord.builder(expectedSchema).set("id", 4).set("flag", false).build());
    Assert.assertEquals(expected, new HashSet<>(outputRecords));
}
Also used: ApplicationManager (io.cdap.cdap.test.ApplicationManager), Table (io.cdap.cdap.api.dataset.table.Table), Schema (io.cdap.cdap.api.data.schema.Schema), WorkflowManager (io.cdap.cdap.test.WorkflowManager), ArrayList (java.util.ArrayList), JoinField (io.cdap.cdap.etl.api.join.JoinField), ETLPlugin (io.cdap.cdap.etl.proto.v2.ETLPlugin), StructuredRecord (io.cdap.cdap.api.data.format.StructuredRecord), JoinCondition (io.cdap.cdap.etl.api.join.JoinCondition), AppRequest (io.cdap.cdap.proto.artifact.AppRequest), ETLBatchConfig (io.cdap.cdap.etl.proto.v2.ETLBatchConfig), ETLStage (io.cdap.cdap.etl.proto.v2.ETLStage), ApplicationId (io.cdap.cdap.proto.id.ApplicationId), HashSet (java.util.HashSet), Test (org.junit.Test)
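
A hedged reading of the MockAutoJoiner.getProperties(...) call above. MockAutoJoiner is a test-scope mock whose source is not shown on this page, so the parameter roles in the comments are inferred from the two tests: Example 12 lists both stages as required and gets an inner join, while this test lists only "sales" as required (left outer) and "categories" as broadcast.

import io.cdap.cdap.etl.api.join.JoinCondition;
import io.cdap.cdap.etl.api.join.JoinField;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Map;

public class LeftOuterBroadcastSketch {
    static Map<String, String> joinerProperties(List<JoinField> select,
                                                JoinCondition.OnExpression condition) {
        return MockAutoJoiner.getProperties(
            Arrays.asList("sales", "categories"),    // stages participating in the join
            Collections.emptyList(),                 // join keys (empty: expression join)
            Collections.singletonList("sales"),      // required side(s): only sales, hence left outer
            Collections.singletonList("categories"), // broadcast side(s)
            select, false, null, condition);         // remaining flags not exercised by this test
    }
}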

Example 14 with ETLPlugin

Use of io.cdap.cdap.etl.proto.v2.ETLPlugin in project cdap by caskdata.

From class DataPipelineServiceTest, method testValidateStageSingleInvalidConfigProperty().

@Test
public void testValidateStageSingleInvalidConfigProperty() throws Exception {
    // StringValueFilterTransform will be configured to filter records where field 'x' has the value 'y'.
    // The stage is invalid because field 'x' is an int in the input schema instead of the required string.
    String stageName = "tx";
    Map<String, String> properties = new HashMap<>();
    properties.put("field", "x");
    properties.put("value", "y");
    ETLStage stage = new ETLStage(stageName, new ETLPlugin(StringValueFilterTransform.NAME, Transform.PLUGIN_TYPE, properties));
    Schema inputSchema = Schema.recordOf("x", Schema.Field.of("x", Schema.of(Schema.Type.INT)));
    StageValidationRequest requestBody = new StageValidationRequest(stage, Collections.singletonList(new StageSchema("input", inputSchema)), false);
    StageValidationResponse actual = sendRequest(requestBody);
    Assert.assertNull(actual.getSpec());
    Assert.assertEquals(1, actual.getFailures().size());
    ValidationFailure failure = actual.getFailures().iterator().next();
    // the stage adds two causes for the invalid input field failure: one pointing at the
    // config property and one pointing at the input schema field
    Assert.assertEquals(2, failure.getCauses().size());
    Assert.assertEquals("field", failure.getCauses().get(0).getAttribute(CauseAttributes.STAGE_CONFIG));
    Assert.assertEquals(stageName, failure.getCauses().get(0).getAttribute(STAGE));
    Assert.assertEquals("x", failure.getCauses().get(1).getAttribute(CauseAttributes.INPUT_SCHEMA_FIELD));
    Assert.assertEquals("input", failure.getCauses().get(1).getAttribute(CauseAttributes.INPUT_STAGE));
    Assert.assertEquals(stageName, failure.getCauses().get(1).getAttribute(STAGE));
}
Also used: StageValidationRequest (io.cdap.cdap.etl.proto.v2.validation.StageValidationRequest), HashMap (java.util.HashMap), ETLStage (io.cdap.cdap.etl.proto.v2.ETLStage), StageSchema (io.cdap.cdap.etl.proto.v2.validation.StageSchema), Schema (io.cdap.cdap.api.data.schema.Schema), ETLPlugin (io.cdap.cdap.etl.proto.v2.ETLPlugin), StageValidationResponse (io.cdap.cdap.etl.proto.v2.validation.StageValidationResponse), ValidationFailure (io.cdap.cdap.etl.api.validation.ValidationFailure), Test (org.junit.Test)
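
The assertions above navigate the failure structure through getFailures(), getCauses(), and getAttribute(). A hypothetical helper using only those accessors to list the offending config properties (the class and method names are illustrative):

import io.cdap.cdap.etl.api.validation.CauseAttributes;
import io.cdap.cdap.etl.api.validation.ValidationFailure;
import io.cdap.cdap.etl.proto.v2.validation.StageValidationResponse;

public class FailureInspectorSketch {
    static void printFailingProperties(StageValidationResponse response) {
        for (ValidationFailure failure : response.getFailures()) {
            for (ValidationFailure.Cause cause : failure.getCauses()) {
                // STAGE_CONFIG is set when a cause points at a config property
                String property = cause.getAttribute(CauseAttributes.STAGE_CONFIG);
                if (property != null) {
                    System.out.println("invalid config property: " + property);
                }
            }
        }
    }
}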

Example 15 with ETLPlugin

Use of io.cdap.cdap.etl.proto.v2.ETLPlugin in project cdap by caskdata.

From class DataPipelineServiceTest, method testValidateStageMissingRequiredProperty().

// tests that plugins that cannot be instantiated due to missing required properties are captured
@Test
public void testValidateStageMissingRequiredProperty() throws Exception {
    String stageName = "tx";
    // string filter requires the field name and the value
    ETLStage stage = new ETLStage(stageName, new ETLPlugin(StringValueFilterTransform.NAME, Transform.PLUGIN_TYPE, Collections.emptyMap()));
    StageValidationResponse actual = sendRequest(new StageValidationRequest(stage, Collections.emptyList(), false));
    Assert.assertNull(actual.getSpec());
    Assert.assertEquals(2, actual.getFailures().size());
    Set<String> properties = new HashSet<>();
    properties.add(actual.getFailures().get(0).getCauses().get(0).getAttribute(CauseAttributes.STAGE_CONFIG));
    properties.add(actual.getFailures().get(1).getCauses().get(0).getAttribute(CauseAttributes.STAGE_CONFIG));
    Set<String> expected = new HashSet<>();
    expected.add("field");
    expected.add("value");
    Assert.assertEquals(expected, properties);
}
Also used: StageValidationRequest (io.cdap.cdap.etl.proto.v2.validation.StageValidationRequest), ETLStage (io.cdap.cdap.etl.proto.v2.ETLStage), ETLPlugin (io.cdap.cdap.etl.proto.v2.ETLPlugin), StageValidationResponse (io.cdap.cdap.etl.proto.v2.validation.StageValidationResponse), HashSet (java.util.HashSet), Test (org.junit.Test)
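
Two ETLPlugin constructors appear across these examples: the three-argument form used by Examples 12 through 15, which leaves artifact selection to the platform, and the four-argument form from Example 11, which carries an artifact selector along. A minimal sketch of the copy-with-new-properties pattern from Example 11 (the class and method names are illustrative):

import io.cdap.cdap.etl.proto.v2.ETLPlugin;
import java.util.Map;

public class PluginCopySketch {
    // Rebuild a plugin with new (e.g. macro-evaluated) properties while keeping
    // its original name, type, and artifact pin: the same calls Example 11 makes.
    static ETLPlugin withProperties(ETLPlugin original, Map<String, String> newProperties) {
        return new ETLPlugin(original.getName(), original.getType(), newProperties,
                             original.getArtifactConfig());
    }
}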

Aggregations

ETLPlugin (io.cdap.cdap.etl.proto.v2.ETLPlugin): 31 uses
ETLStage (io.cdap.cdap.etl.proto.v2.ETLStage): 29 uses
Test (org.junit.Test): 26 uses
ETLBatchConfig (io.cdap.cdap.etl.proto.v2.ETLBatchConfig): 20 uses
HashMap (java.util.HashMap): 20 uses
ArrayList (java.util.ArrayList): 18 uses
StructuredRecord (io.cdap.cdap.api.data.format.StructuredRecord): 17 uses
Schema (io.cdap.cdap.api.data.schema.Schema): 17 uses
ApplicationId (io.cdap.cdap.proto.id.ApplicationId): 17 uses
Table (io.cdap.cdap.api.dataset.table.Table): 16 uses
ApplicationManager (io.cdap.cdap.test.ApplicationManager): 16 uses
WorkflowManager (io.cdap.cdap.test.WorkflowManager): 16 uses
HashSet (java.util.HashSet): 13 uses
AppRequest (io.cdap.cdap.proto.artifact.AppRequest): 12 uses
KeyValueTable (io.cdap.cdap.api.dataset.lib.KeyValueTable): 11 uses
StageValidationResponse (io.cdap.cdap.etl.proto.v2.validation.StageValidationResponse): 10 uses
StageValidationRequest (io.cdap.cdap.etl.proto.v2.validation.StageValidationRequest): 9 uses
FileSet (io.cdap.cdap.api.dataset.lib.FileSet): 7 uses
ETLPlugin (co.cask.cdap.etl.proto.v2.ETLPlugin): 6 uses
SpamMessage (io.cdap.cdap.datapipeline.mock.SpamMessage): 6 uses