
Example 16 with ETLPlugin

Use of io.cdap.cdap.etl.proto.v2.ETLPlugin in project cdap by caskdata.

From class DataPipelineServiceTest, method testValidateMultiInputInvalidInputField.

@Test
public void testValidateMultiInputInvalidInputField() throws Exception {
    // StringValueFilterTransform will be configured to filter records where field x has value 'y'
    // it will be invalid because the type of field x will be an int instead of the required string
    String stageName = "tx";
    Map<String, String> properties = new HashMap<>();
    properties.put("field", "x");
    properties.put("value", "y");
    ETLStage stage = new ETLStage(stageName, new ETLPlugin(StringValueFilterTransform.NAME, Transform.PLUGIN_TYPE, properties));
    Schema inputSchema = Schema.recordOf("x", Schema.Field.of("x", Schema.of(Schema.Type.INT)));
    StageValidationRequest requestBody = new StageValidationRequest(stage, ImmutableList.of(new StageSchema("input1", inputSchema), new StageSchema("input2", inputSchema)), false);
    StageValidationResponse actual = sendRequest(requestBody);
    List<String> expectedInputs = ImmutableList.of("input1", "input2");
    Assert.assertNull(actual.getSpec());
    Assert.assertEquals(1, actual.getFailures().size());
    ValidationFailure failure = actual.getFailures().iterator().next();
    // the stage will add 3 causes: two related to the input field and one related to the config property
    Assert.assertEquals(3, failure.getCauses().size());
    Assert.assertEquals("field", failure.getCauses().get(0).getAttribute(CauseAttributes.STAGE_CONFIG));
    Assert.assertEquals(stageName, failure.getCauses().get(0).getAttribute(STAGE));
    Assert.assertEquals(stageName, failure.getCauses().get(1).getAttribute(STAGE));
    Assert.assertEquals(stageName, failure.getCauses().get(2).getAttribute(STAGE));
    Assert.assertTrue(expectedInputs.contains(failure.getCauses().get(1).getAttribute(CauseAttributes.INPUT_STAGE)));
    Assert.assertTrue(expectedInputs.contains(failure.getCauses().get(2).getAttribute(CauseAttributes.INPUT_STAGE)));
}
Also used : StageSchema(io.cdap.cdap.etl.proto.v2.validation.StageSchema) StageValidationRequest(io.cdap.cdap.etl.proto.v2.validation.StageValidationRequest) HashMap(java.util.HashMap) ETLStage(io.cdap.cdap.etl.proto.v2.ETLStage) Schema(io.cdap.cdap.api.data.schema.Schema) ETLPlugin(io.cdap.cdap.etl.proto.v2.ETLPlugin) StageValidationResponse(io.cdap.cdap.etl.proto.v2.validation.StageValidationResponse) ValidationFailure(io.cdap.cdap.etl.api.validation.ValidationFailure) Test(org.junit.Test)
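
For contrast, a minimal sketch of the passing case, assuming the same test harness and helper (sendRequest) as the example above: when field x is declared as a string, StringValueFilterTransform's type requirement is met and the response should carry no failures.

@Test
public void testValidateMultiInputValidInputField() throws Exception {
    // same stage as above, but field x is a string, which StringValueFilterTransform accepts
    String stageName = "tx";
    Map<String, String> properties = new HashMap<>();
    properties.put("field", "x");
    properties.put("value", "y");
    ETLStage stage = new ETLStage(stageName, new ETLPlugin(StringValueFilterTransform.NAME, Transform.PLUGIN_TYPE, properties));
    Schema inputSchema = Schema.recordOf("x", Schema.Field.of("x", Schema.of(Schema.Type.STRING)));
    StageValidationRequest requestBody = new StageValidationRequest(stage, ImmutableList.of(new StageSchema("input1", inputSchema), new StageSchema("input2", inputSchema)), false);
    StageValidationResponse actual = sendRequest(requestBody);
    // no failures expected when the type requirement is satisfied
    Assert.assertTrue(actual.getFailures().isEmpty());
}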

Example 17 with ETLPlugin

Use of io.cdap.cdap.etl.proto.v2.ETLPlugin in project cdap by caskdata.

From class PipelineSpecGenerator, method configureStage.

/**
 * Configures a plugin and returns the spec for it.
 *
 * @param stageName the unique plugin id
 * @param etlPlugin user-provided configuration for the plugin
 * @param pipelineConfigurer default pipeline configurer to configure the plugin
 * @return the spec for the plugin
 * @throws IllegalArgumentException if a plugin with the same id is already deployed
 * @throws ValidationException if the plugin threw an exception during configuration
 */
public StageSpec.Builder configureStage(String stageName, ETLPlugin etlPlugin, DefaultPipelineConfigurer pipelineConfigurer) throws ValidationException {
    TrackedPluginSelector pluginSelector = new TrackedPluginSelector(new ArtifactSelectorProvider().getPluginSelector(etlPlugin.getArtifactConfig()));
    String type = etlPlugin.getType();
    String pluginName = etlPlugin.getName();
    DefaultStageConfigurer stageConfigurer = pipelineConfigurer.getStageConfigurer();
    FailureCollector collector = stageConfigurer.getFailureCollector();
    Object plugin = getPlugin(stageName, etlPlugin, pluginSelector, type, pluginName, collector);
    try {
        if (type.equals(BatchJoiner.PLUGIN_TYPE)) {
            MultiInputPipelineConfigurable multiPlugin = (MultiInputPipelineConfigurable) plugin;
            multiPlugin.configurePipeline(pipelineConfigurer);
            // an AutoJoiner is deployed under the BatchJoiner plugin type; its configuration is delegated
            // to the BatchAutoJoiner while preserving backwards compatibility in the pipeline config.
            if (plugin instanceof AutoJoiner) {
                configureAutoJoiner(stageName, (AutoJoiner) plugin, stageConfigurer, collector);
            }
        } else if (type.equals(SplitterTransform.PLUGIN_TYPE)) {
            MultiOutputPipelineConfigurable multiOutputPlugin = (MultiOutputPipelineConfigurable) plugin;
            multiOutputPlugin.configurePipeline(pipelineConfigurer);
        } else if (!type.equals(Constants.SPARK_PROGRAM_PLUGIN_TYPE)) {
            PipelineConfigurable singlePlugin = (PipelineConfigurable) plugin;
            singlePlugin.configurePipeline(pipelineConfigurer);
            // evaluate macros to find out whether a connection is used
            if ((sourcePluginTypes.contains(type) || BatchSink.PLUGIN_TYPE.equals(type)) && runtimeEvaluator == null) {
                pluginConfigurer.evaluateMacros(etlPlugin.getProperties(), connectionEvaluator, options);
            }
        }
    } catch (InvalidConfigPropertyException e) {
        collector.addFailure(e.getMessage(), String.format("Provide valid value for config property '%s'.", e.getProperty())).withConfigProperty(e.getProperty());
    } catch (InvalidStageException e) {
        if (e.getReasons().isEmpty()) {
            collector.addFailure(e.getMessage(), null);
        }
        for (InvalidStageException reason : e.getReasons()) {
            if (reason instanceof InvalidConfigPropertyException) {
                InvalidConfigPropertyException configException = (InvalidConfigPropertyException) reason;
                collector.addFailure(configException.getMessage(), String.format("Provide valid value for config property '%s'.", configException.getProperty())).withConfigProperty(configException.getProperty());
            } else {
                collector.addFailure(reason.getMessage(), null);
            }
        }
    } catch (ValidationException e) {
        throw e;
    } catch (NullPointerException e) {
        // handle the case where the plugin throws a NullPointerException, to avoid surfacing 'null' as the error message
        collector.addFailure(String.format("Null error occurred while configuring the stage %s.", stageName), null).withStacktrace(e.getStackTrace());
    } catch (ArrayIndexOutOfBoundsException e) {
        // handle the case where the plugin throws an ArrayIndexOutOfBoundsException,
        // to avoid surfacing a bare number like '2' or '8' as the error message
        collector.addFailure(String.format("Index out of bounds error occurred while configuring the stage %s.", stageName), null).withStacktrace(e.getStackTrace());
    } catch (ConnectionBadRequestException e) {
        collector.addFailure(e.getMessage(), "Provide a valid connection name.");
    } catch (Exception e) {
        collector.addFailure(String.format("Error encountered while configuring the stage: '%s'", e.getMessage()), null).withStacktrace(e.getStackTrace());
    }
    // throw a ValidationException if the failure collector is carrying any errors
    collector.getOrThrowException();
    PluginSpec pluginSpec = new PluginSpec(type, pluginName, etlPlugin.getProperties(), pluginSelector.getSelectedArtifact());
    StageSpec.Builder specBuilder = StageSpec.builder(stageName, pluginSpec).addInputSchemas(pipelineConfigurer.getStageConfigurer().getInputSchemas()).setErrorSchema(stageConfigurer.getErrorSchema());
    if (type.equals(SplitterTransform.PLUGIN_TYPE)) {
        specBuilder.setPortSchemas(stageConfigurer.getOutputPortSchemas());
    } else {
        specBuilder.setOutputSchema(stageConfigurer.getOutputSchema());
    }
    return specBuilder;
}
Also used : ArtifactSelectorProvider(io.cdap.cdap.etl.common.ArtifactSelectorProvider) ValidationException(io.cdap.cdap.etl.api.validation.ValidationException) InvalidStageException(io.cdap.cdap.etl.api.validation.InvalidStageException) MultiOutputPipelineConfigurable(io.cdap.cdap.etl.api.MultiOutputPipelineConfigurable) DefaultStageConfigurer(io.cdap.cdap.etl.common.DefaultStageConfigurer) InvalidConfigPropertyException(io.cdap.cdap.etl.api.validation.InvalidConfigPropertyException) InvalidPluginConfigException(io.cdap.cdap.api.plugin.InvalidPluginConfigException) ConnectionBadRequestException(io.cdap.cdap.etl.proto.connection.ConnectionBadRequestException) PluginSpec(io.cdap.cdap.etl.proto.v2.spec.PluginSpec) MultiInputPipelineConfigurable(io.cdap.cdap.etl.api.MultiInputPipelineConfigurable) StageSpec(io.cdap.cdap.etl.proto.v2.spec.StageSpec) AutoJoiner(io.cdap.cdap.etl.api.join.AutoJoiner) PipelineConfigurable(io.cdap.cdap.etl.api.PipelineConfigurable) FailureCollector(io.cdap.cdap.etl.api.FailureCollector)
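
As a usage note, a minimal sketch of how a caller can surface a bad config property through the collector rather than throwing directly, mirroring the InvalidConfigPropertyException handling above. The helper method itself is hypothetical; FailureCollector, ValidationException, and the addFailure/withConfigProperty/getOrThrowException calls are the ones shown in configureStage.

/**
 * Hypothetical helper: report an invalid config property through the collector,
 * mirroring the InvalidConfigPropertyException handling in configureStage above.
 */
static void reportBadProperty(FailureCollector collector, String property, String message) throws ValidationException {
    collector.addFailure(message, String.format("Provide valid value for config property '%s'.", property))
        .withConfigProperty(property);
    // throws ValidationException if the collector is carrying any failures
    collector.getOrThrowException();
}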

Example 18 with ETLPlugin

Use of io.cdap.cdap.etl.proto.v2.ETLPlugin in project cdap by caskdata.

From class PipelineTest, method testWordCount.

public void testWordCount(String pluginType) throws Exception {
    String inputName = "wcInput-" + pluginType;
    String outputName = "wcOutput-" + pluginType;
    // create the pipeline config
    ETLStage source = new ETLStage("wcInput", MockSource.getPlugin(inputName));
    ETLStage sink = new ETLStage("wcOutput", MockSink.getPlugin(outputName));
    Map<String, String> aggProperties = new HashMap<>();
    aggProperties.put("field", "text");
    ETLStage agg = new ETLStage("middle", new ETLPlugin("WordCount", pluginType, aggProperties, null));
    ETLBatchConfig pipelineConfig = ETLBatchConfig.builder("* * * * *").addStage(source).addStage(sink).addStage(agg).addConnection(source.getName(), agg.getName()).addConnection(agg.getName(), sink.getName()).build();
    // create the pipeline
    ApplicationId pipelineId = NamespaceId.DEFAULT.app("wcTestPipeline-" + pluginType);
    ApplicationManager appManager = deployApplication(pipelineId, new AppRequest<>(APP_ARTIFACT, pipelineConfig));
    // write the input
    Schema inputSchema = Schema.recordOf("text", Schema.Field.of("text", Schema.of(Schema.Type.STRING)));
    DataSetManager<Table> inputManager = getDataset(inputName);
    List<StructuredRecord> inputRecords = new ArrayList<>();
    inputRecords.add(StructuredRecord.builder(inputSchema).set("text", "Hello World").build());
    inputRecords.add(StructuredRecord.builder(inputSchema).set("text", "Hello my name is Hal").build());
    inputRecords.add(StructuredRecord.builder(inputSchema).set("text", "Hello my name is Sam").build());
    MockSource.writeInput(inputManager, inputRecords);
    WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
    workflowManager.start();
    workflowManager.waitForRun(ProgramRunStatus.COMPLETED, 4, TimeUnit.MINUTES);
    DataSetManager<Table> outputManager = getDataset(outputName);
    Set<StructuredRecord> outputRecords = new HashSet<>();
    outputRecords.addAll(MockSink.readOutput(outputManager));
    Set<StructuredRecord> expected = new HashSet<>();
    expected.add(StructuredRecord.builder(WordCountAggregator.OUTPUT_SCHEMA).set("word", "Hello").set("count", 3L).build());
    expected.add(StructuredRecord.builder(WordCountAggregator.OUTPUT_SCHEMA).set("word", "World").set("count", 1L).build());
    expected.add(StructuredRecord.builder(WordCountAggregator.OUTPUT_SCHEMA).set("word", "my").set("count", 2L).build());
    expected.add(StructuredRecord.builder(WordCountAggregator.OUTPUT_SCHEMA).set("word", "name").set("count", 2L).build());
    expected.add(StructuredRecord.builder(WordCountAggregator.OUTPUT_SCHEMA).set("word", "is").set("count", 2L).build());
    expected.add(StructuredRecord.builder(WordCountAggregator.OUTPUT_SCHEMA).set("word", "Hal").set("count", 1L).build());
    expected.add(StructuredRecord.builder(WordCountAggregator.OUTPUT_SCHEMA).set("word", "Sam").set("count", 1L).build());
    Assert.assertEquals(expected, outputRecords);
}
Also used : ApplicationManager(io.cdap.cdap.test.ApplicationManager) KeyValueTable(io.cdap.cdap.api.dataset.lib.KeyValueTable) Table(io.cdap.cdap.api.dataset.table.Table) HashMap(java.util.HashMap) Schema(io.cdap.cdap.api.data.schema.Schema) WorkflowManager(io.cdap.cdap.test.WorkflowManager) ArrayList(java.util.ArrayList) ETLPlugin(io.cdap.cdap.etl.proto.v2.ETLPlugin) StructuredRecord(io.cdap.cdap.api.data.format.StructuredRecord) ETLBatchConfig(io.cdap.cdap.etl.proto.v2.ETLBatchConfig) ETLStage(io.cdap.cdap.etl.proto.v2.ETLStage) ApplicationId(io.cdap.cdap.proto.id.ApplicationId) HashSet(java.util.HashSet)
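
A minimal sketch of how the parameterized helper above might be driven. The plugin type string is an assumption (CDAP's batch aggregator plugin type), not taken from this test class; use whatever type WordCount was deployed under.

@Test
public void testWordCountBatchAggregator() throws Exception {
    // "batchaggregator" is assumed here; substitute the plugin type WordCount was deployed under
    testWordCount("batchaggregator");
}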

Example 19 with ETLPlugin

Use of io.cdap.cdap.etl.proto.v2.ETLPlugin in project cdap by cdapio.

From class PipelineSpecGenerator, method configureStage.

/**
 * Configures a stage and returns the spec for it.
 *
 * @param stage the user-provided configuration for the stage
 * @param validatedPipeline the validated pipeline config
 * @param pluginConfigurer configurer used to configure the stage
 * @return the spec for the stage
 * @throws ValidationException if the plugin threw an exception during configuration
 */
protected ConfiguredStage configureStage(ETLStage stage, ValidatedPipeline validatedPipeline, DefaultPipelineConfigurer pluginConfigurer) throws ValidationException {
    String stageName = stage.getName();
    ETLPlugin stagePlugin = stage.getPlugin();
    StageSpec.Builder specBuilder = configureStage(stageName, stagePlugin, pluginConfigurer);
    DefaultStageConfigurer stageConfigurer = pluginConfigurer.getStageConfigurer();
    String pluginType = stage.getPlugin().getType();
    if (pluginType.equals(SplitterTransform.PLUGIN_TYPE)) {
        Map<String, Schema> outputPortSchemas = stageConfigurer.getOutputPortSchemas();
        for (Map.Entry<String, String> outputEntry : validatedPipeline.getOutputPorts(stageName).entrySet()) {
            String outputStage = outputEntry.getKey();
            String outputPort = outputEntry.getValue();
            if (outputPort == null) {
                throw new IllegalArgumentException(String.format("Connection from Splitter '%s' to '%s' must specify a port.", stageName, outputStage));
            }
            specBuilder.addOutput(outputStage, outputPort, outputPortSchemas.get(outputPort));
        }
    } else {
        Schema outputSchema = stageConfigurer.getOutputSchema();
        // a Condition stage passes records through, so its output schema comes from its inputs;
        // if there are multiple input schemas, they must all be the same
        if (Condition.PLUGIN_TYPE.equals(pluginType)) {
            outputSchema = null;
            for (Schema schema : stageConfigurer.getInputSchemas().values()) {
                if (schema != null) {
                    // todo: fix this cleanly and fully
                    if (outputSchema != null && !Schemas.equalsIgnoringRecordName(outputSchema, schema)) {
                        throw new IllegalArgumentException("Cannot have different input schemas going into stage " + stageName);
                    }
                    outputSchema = schema;
                }
            }
        }
        for (String outputStage : validatedPipeline.getOutputs(stageName)) {
            specBuilder.addOutput(outputStage, null, outputSchema);
        }
    }
    StageSpec stageSpec = specBuilder.setProcessTimingEnabled(validatedPipeline.isProcessTimingEnabled()).setStageLoggingEnabled(validatedPipeline.isStageLoggingEnabled()).setMaxPreviewRecords(validatedPipeline.getMaxPreviewRecords()).build();
    return new ConfiguredStage(stageSpec, pluginConfigurer.getPipelineProperties());
}
Also used : StageSpec(io.cdap.cdap.etl.proto.v2.spec.StageSpec) Schema(io.cdap.cdap.api.data.schema.Schema) ETLPlugin(io.cdap.cdap.etl.proto.v2.ETLPlugin) DefaultStageConfigurer(io.cdap.cdap.etl.common.DefaultStageConfigurer) Map(java.util.Map) HashMap(java.util.HashMap)
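
To illustrate the Condition branch above, a small standalone sketch (field names invented) of what the schema check tolerates: two input schemas that differ only in record name pass Schemas.equalsIgnoringRecordName, while a differing field type would trigger the IllegalArgumentException.

// two inputs to a Condition stage that differ only in record name: accepted
Schema a = Schema.recordOf("recA", Schema.Field.of("x", Schema.of(Schema.Type.INT)));
Schema b = Schema.recordOf("recB", Schema.Field.of("x", Schema.of(Schema.Type.INT)));
// Schemas.equalsIgnoringRecordName(a, b) is true, so either schema becomes the output schema;
// changing b's field x to STRING would fail the check and raise IllegalArgumentException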

Example 20 with ETLPlugin

Use of io.cdap.cdap.etl.proto.v2.ETLPlugin in project cdap by cdapio.

From class PipelineTest, method testTextFileSinkAndDeletePostAction.

@Test
public void testTextFileSinkAndDeletePostAction() throws Exception {
    // create the pipeline config
    String inputName = "sinkTestInput";
    String outputName = "sinkTestOutput";
    String outputDirName = "users";
    ETLStage source = new ETLStage("source", MockSource.getPlugin(inputName));
    Map<String, String> sinkProperties = new HashMap<>();
    sinkProperties.put(TextFileSetSink.Conf.FILESET_NAME, outputName);
    sinkProperties.put(TextFileSetSink.Conf.FIELD_SEPARATOR, "|");
    sinkProperties.put(TextFileSetSink.Conf.OUTPUT_DIR, "${dir}");
    ETLStage sink = new ETLStage("sink", new ETLPlugin(TextFileSetSink.NAME, BatchSink.PLUGIN_TYPE, sinkProperties, null));
    Map<String, String> actionProperties = new HashMap<>();
    actionProperties.put(FilesetDeletePostAction.Conf.FILESET_NAME, outputName);
    // MapReduce writes multiple files to the output directory. Along with the actual output,
    // there are various .crc files that do not contain any of the output content.
    actionProperties.put(FilesetDeletePostAction.Conf.DELETE_REGEX, ".*\\.crc|_SUCCESS");
    actionProperties.put(FilesetDeletePostAction.Conf.DIRECTORY, outputDirName);
    ETLStage postAction = new ETLStage("cleanup", new ETLPlugin(FilesetDeletePostAction.NAME, PostAction.PLUGIN_TYPE, actionProperties, null));
    ETLBatchConfig pipelineConfig = ETLBatchConfig.builder("* * * * *").addStage(source).addStage(sink).addPostAction(postAction).addConnection(source.getName(), sink.getName()).build();
    // create the pipeline
    ApplicationId pipelineId = NamespaceId.DEFAULT.app("textSinkTestPipeline");
    ApplicationManager appManager = deployApplication(pipelineId, new AppRequest<>(APP_ARTIFACT, pipelineConfig));
    // write some data to the input fileset
    Schema inputSchema = Schema.recordOf("test", Schema.Field.of("name", Schema.of(Schema.Type.STRING)), Schema.Field.of("item", Schema.of(Schema.Type.STRING)));
    Map<String, String> users = new HashMap<>();
    users.put("samuel", "wallet");
    users.put("dwayne", "rock");
    users.put("christopher", "cowbell");
    List<StructuredRecord> inputRecords = new ArrayList<>();
    for (Map.Entry<String, String> userEntry : users.entrySet()) {
        String name = userEntry.getKey();
        String item = userEntry.getValue();
        inputRecords.add(StructuredRecord.builder(inputSchema).set("name", name).set("item", item).build());
    }
    DataSetManager<Table> inputManager = getDataset(inputName);
    MockSource.writeInput(inputManager, inputRecords);
    // run the pipeline
    Map<String, String> runtimeArgs = new HashMap<>();
    // the ${dir} macro will be substituted with "users" for our pipeline run
    runtimeArgs.put("dir", outputDirName);
    WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
    workflowManager.start(runtimeArgs);
    workflowManager.waitForRun(ProgramRunStatus.COMPLETED, 4, TimeUnit.MINUTES);
    // check the pipeline output
    DataSetManager<FileSet> outputManager = getDataset(outputName);
    FileSet output = outputManager.get();
    Location outputDir = output.getBaseLocation().append(outputDirName);
    Map<String, String> actual = new HashMap<>();
    for (Location outputFile : outputDir.list()) {
        if (outputFile.getName().endsWith(".crc") || "_SUCCESS".equals(outputFile.getName())) {
            Assert.fail("Post action did not delete file " + outputFile.getName());
        }
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(outputFile.getInputStream()))) {
            String line;
            while ((line = reader.readLine()) != null) {
                String[] parts = line.split("\\|");
                actual.put(parts[0], parts[1]);
            }
        }
    }
    // JUnit convention: expected value first, then actual
    Assert.assertEquals(users, actual);
}
Also used : ApplicationManager(io.cdap.cdap.test.ApplicationManager) KeyValueTable(io.cdap.cdap.api.dataset.lib.KeyValueTable) Table(io.cdap.cdap.api.dataset.table.Table) FileSet(io.cdap.cdap.api.dataset.lib.FileSet) InputStreamReader(java.io.InputStreamReader) HashMap(java.util.HashMap) Schema(io.cdap.cdap.api.data.schema.Schema) WorkflowManager(io.cdap.cdap.test.WorkflowManager) ArrayList(java.util.ArrayList) ETLPlugin(io.cdap.cdap.etl.proto.v2.ETLPlugin) StructuredRecord(io.cdap.cdap.api.data.format.StructuredRecord) ETLBatchConfig(io.cdap.cdap.etl.proto.v2.ETLBatchConfig) ETLStage(io.cdap.cdap.etl.proto.v2.ETLStage) BufferedReader(java.io.BufferedReader) ApplicationId(io.cdap.cdap.proto.id.ApplicationId) Map(java.util.Map) Location(org.apache.twill.filesystem.Location) Test(org.junit.Test)
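
A standalone sketch of what the cleanup regex above matches; plain java.lang.String, no CDAP types needed. The file names are illustrative (typical MapReduce part-file names).

String deleteRegex = ".*\\.crc|_SUCCESS";
// matched: deleted by FilesetDeletePostAction
assert "part-r-00000.crc".matches(deleteRegex);
assert "_SUCCESS".matches(deleteRegex);
// not matched: the actual output file survives
assert !"part-r-00000".matches(deleteRegex);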

Aggregations

ETLPlugin (io.cdap.cdap.etl.proto.v2.ETLPlugin) 154
ETLStage (io.cdap.cdap.etl.proto.v2.ETLStage) 144
Test (org.junit.Test) 125
ApplicationManager (io.cdap.cdap.test.ApplicationManager) 102
ETLBatchConfig (io.cdap.cdap.etl.proto.v2.ETLBatchConfig) 101
Table (io.cdap.cdap.api.dataset.table.Table) 79
StructuredRecord (io.cdap.cdap.api.data.format.StructuredRecord) 77
ApplicationId (io.cdap.cdap.proto.id.ApplicationId) 70
AppRequest (io.cdap.cdap.proto.artifact.AppRequest) 59
Schema (io.cdap.cdap.api.data.schema.Schema) 55
HashMap (java.util.HashMap) 55
WorkflowManager (io.cdap.cdap.test.WorkflowManager) 53
ImmutableMap (com.google.common.collect.ImmutableMap) 36
KeyValueTable (io.cdap.cdap.api.dataset.lib.KeyValueTable) 35
HashSet (java.util.HashSet) 35
ArrayList (java.util.ArrayList) 34
StageValidationResponse (io.cdap.cdap.etl.proto.v2.validation.StageValidationResponse) 20
StageValidationRequest (io.cdap.cdap.etl.proto.v2.validation.StageValidationRequest) 18
File (java.io.File) 17
FileSet (io.cdap.cdap.api.dataset.lib.FileSet) 14