Search in sources :

Example 61 with ETLStage

use of co.cask.cdap.etl.proto.v2.ETLStage in project cdap by caskdata.

the class PipelineSpecGeneratorTest method testNestedConditionConnectionWithMultipleTrueBranches.

/**
 * Builds a pipeline with a condition nested under the true branch of another condition and
 * expects spec generation to fail with an {@link IllegalArgumentException}.
 *
 * The nested condition ('condition2') has two outgoing connections carrying the same branch
 * flag (both are added with {@code false}), one to 't11' and one to 't12'.
 * NOTE(review): the method name mentions "true" branches while both flags are false - confirm
 * which of the two is intended.
 */
@Test(expected = IllegalArgumentException.class)
public void testNestedConditionConnectionWithMultipleTrueBranches() {
    ETLBatchConfig invalidConfig = ETLBatchConfig.builder("* * * * *")
        // stages
        .addStage(new ETLStage("source", MOCK_SOURCE))
        .addStage(new ETLStage("condition1", MOCK_CONDITION))
        .addStage(new ETLStage("condition2", MOCK_CONDITION))
        .addStage(new ETLStage("t1", MOCK_TRANSFORM_A))
        .addStage(new ETLStage("t11", MOCK_TRANSFORM_A))
        .addStage(new ETLStage("t12", MOCK_TRANSFORM_A))
        .addStage(new ETLStage("t2", MOCK_TRANSFORM_B))
        .addStage(new ETLStage("sink1", MOCK_SINK))
        .addStage(new ETLStage("sink2", MOCK_SINK))
        // outer condition: true -> t1 -> condition2
        .addConnection("source", "condition1")
        .addConnection("condition1", "t1", true)
        .addConnection("t1", "condition2")
        // nested condition: two connections with the same branch flag
        .addConnection("condition2", "t11", false)
        .addConnection("condition2", "t12", false)
        // outer condition: false -> t2
        .addConnection("condition1", "t2", false)
        // sinks
        .addConnection("t11", "sink1")
        .addConnection("t12", "sink1")
        .addConnection("t2", "sink2")
        .build();

    specGenerator.generateSpec(invalidConfig);
}
Also used : ETLBatchConfig(co.cask.cdap.etl.proto.v2.ETLBatchConfig) ETLStage(co.cask.cdap.etl.proto.v2.ETLStage) Test(org.junit.Test)

Example 62 with ETLStage

use of co.cask.cdap.etl.proto.v2.ETLStage in project cdap by caskdata.

the class PipelineSpecGeneratorTest method testConnectionIntoSource.

/**
 * Expects spec generation to fail with an {@link IllegalArgumentException} when the config
 * contains a connection whose destination is the 'source' stage ('transform' -> 'source').
 */
@Test(expected = IllegalArgumentException.class)
public void testConnectionIntoSource() {
    ETLBatchConfig invalidConfig = ETLBatchConfig.builder("* * * * *")
        .addStage(new ETLStage("source", MOCK_SOURCE))
        .addStage(new ETLStage("sink", MOCK_SINK))
        .addStage(new ETLStage("transform", MOCK_TRANSFORM_A))
        .addConnection("source", "sink")
        // the offending edge: something feeds into the source
        .addConnection("transform", "source")
        .build();

    specGenerator.generateSpec(invalidConfig);
}
Also used : ETLBatchConfig(co.cask.cdap.etl.proto.v2.ETLBatchConfig) ETLStage(co.cask.cdap.etl.proto.v2.ETLStage) Test(org.junit.Test)

Example 63 with ETLStage

use of co.cask.cdap.etl.proto.v2.ETLStage in project cdap by caskdata.

the class PipelineSpecGeneratorTest method testUniqueStageNames.

/**
 * Expects spec generation to fail with an {@link IllegalArgumentException} when two stages
 * are registered under the same name ('t1' is added twice with different plugins).
 */
@Test(expected = IllegalArgumentException.class)
public void testUniqueStageNames() {
    ETLBatchConfig invalidConfig = ETLBatchConfig.builder("* * * * *")
        .addStage(new ETLStage("source", MOCK_SOURCE))
        // same stage name used for two different transforms
        .addStage(new ETLStage("t1", MOCK_TRANSFORM_A))
        .addStage(new ETLStage("t1", MOCK_TRANSFORM_B))
        .addStage(new ETLStage("sink", MOCK_SINK))
        .addConnection("source", "sink")
        .build();

    specGenerator.generateSpec(invalidConfig);
}
Also used : ETLBatchConfig(co.cask.cdap.etl.proto.v2.ETLBatchConfig) ETLStage(co.cask.cdap.etl.proto.v2.ETLStage) Test(org.junit.Test)

Example 64 with ETLStage

use of co.cask.cdap.etl.proto.v2.ETLStage in project cdap by caskdata.

the class PipelineSpecGeneratorTest method testConnectionWithMissingStage.

/**
 * Expects spec generation to fail with an {@link IllegalArgumentException} when a connection
 * refers to a stage name ('stage2') that was never added to the config.
 */
@Test(expected = IllegalArgumentException.class)
public void testConnectionWithMissingStage() {
    ETLBatchConfig invalidConfig = ETLBatchConfig.builder("* * * * *")
        .addStage(new ETLStage("source", MOCK_SOURCE))
        .addStage(new ETLStage("sink", MOCK_SINK))
        .addConnection("source", "sink")
        // 'stage2' is not one of the stages declared above
        .addConnection("source", "stage2")
        .build();

    specGenerator.generateSpec(invalidConfig);
}
Also used : ETLBatchConfig(co.cask.cdap.etl.proto.v2.ETLBatchConfig) ETLStage(co.cask.cdap.etl.proto.v2.ETLStage) Test(org.junit.Test)

Example 65 with ETLStage

use of co.cask.cdap.etl.proto.v2.ETLStage in project cdap by caskdata.

the class PipelineSpecGenerator method configureStages.

/**
 * Performs most of the validation and configuration needed by a pipeline.
 * Handles stages, connections, resources, and stage logging settings.
 *
 * <p>Stages are configured in the traversal order returned by the validated pipeline. After each
 * stage is configured, its schema is pushed to every stage it outputs to as an input schema.
 * Pipeline properties are collected from the config (keys prefixed by {@code system.[engine].})
 * and from the configured stages; a value set by a stage replaces a config value of the same name.
 *
 * @param config user provided ETL config
 * @param specBuilder builder for creating a pipeline spec.
 * @throws IllegalArgumentException if a splitter is connected to a stage without a port, if a
 *   joiner would receive a null input schema, if a stage that is not a joiner, action or condition
 *   would receive two different input schemas, or if stages set the same pipeline property to
 *   different values
 */
protected void configureStages(ETLConfig config, PipelineSpec.Builder specBuilder) {
    // validate the config and determine the order we should configure the stages in.
    ValidatedPipeline validatedPipeline = validateConfig(config);
    List<ETLStage> traversalOrder = validatedPipeline.getTraversalOrder();

    // create every configurer up front: while configuring one stage we need to add input schemas
    // to the configurers of the stages that come after it.
    Map<String, DefaultPipelineConfigurer<T>> pluginConfigurers = new HashMap<>(traversalOrder.size());
    Map<String, String> pluginTypes = new HashMap<>(traversalOrder.size());
    for (ETLStage stage : traversalOrder) {
        String stageName = stage.getName();
        pluginTypes.put(stageName, stage.getPlugin().getType());
        pluginConfigurers.put(stageName, new DefaultPipelineConfigurer<>(configurer, stageName, engine));
    }

    // anything prefixed by 'system.[engine].' is a pipeline property.
    Map<String, String> pipelineProperties = new HashMap<>();
    // lower-case with a fixed locale so the prefix does not depend on the JVM's default locale
    String prefix = String.format("system.%s.", engine.name().toLowerCase(java.util.Locale.ROOT));
    int prefixLength = prefix.length();
    for (Map.Entry<String, String> property : config.getProperties().entrySet()) {
        if (property.getKey().startsWith(prefix)) {
            String strippedKey = property.getKey().substring(prefixLength);
            pipelineProperties.put(strippedKey, property.getValue());
        }
    }

    // row = property name, column = property value, val = stage that set the property
    // this is used so that we can error with a nice message about which stages are setting conflicting properties
    Table<String, String, String> propertiesFromStages = HashBasedTable.create();

    // configure the stages in order and build up the stage specs
    for (ETLStage stage : traversalOrder) {
        String stageName = stage.getName();
        DefaultPipelineConfigurer<T> pluginConfigurer = pluginConfigurers.get(stageName);
        ConfiguredStage configuredStage = configureStage(stage, validatedPipeline, pluginConfigurer);

        // for each output, set their input schema to our output schema
        for (String nextStageName : validatedPipeline.getOutputs(stageName)) {
            String nextStageType = pluginTypes.get(nextStageName);
            DefaultStageConfigurer outputStageConfigurer = pluginConfigurers.get(nextStageName).getStageConfigurer();

            // if the output stage is an error transform, it takes the error schema of this stage as its input.
            // if the current stage is a splitter transform, the output stage takes the schema of the port it is
            // connected to.
            // all other plugin types take the output schema of this stage as their input.
            Schema nextStageInputSchema;
            if (ErrorTransform.PLUGIN_TYPE.equals(nextStageType)) {
                nextStageInputSchema = configuredStage.stageSpec.getErrorSchema();
            } else if (SplitterTransform.PLUGIN_TYPE.equals(configuredStage.stageSpec.getPlugin().getType())) {
                StageSpec.Port portSpec = configuredStage.stageSpec.getOutputPorts().get(nextStageName);
                // this can happen if the ports are dependent on the data received by the plugin
                if (portSpec == null) {
                    nextStageInputSchema = null;
                } else if (portSpec.getPort() == null) {
                    // Should not happen since it should have been validated earlier, but check here just in case
                    throw new IllegalArgumentException(
                        String.format("Must specify a port when connecting Splitter '%s' to '%s'",
                                      stageName, nextStageName));
                } else {
                    nextStageInputSchema = portSpec.getSchema();
                }
            } else {
                nextStageInputSchema = configuredStage.stageSpec.getOutputSchema();
            }

            // Do not allow null input schema for Joiner
            if (BatchJoiner.PLUGIN_TYPE.equals(nextStageType) && nextStageInputSchema == null) {
                throw new IllegalArgumentException(
                    String.format("Joiner cannot have any null input schemas, but stage %s " + "outputs a null schema.",
                                  stageName));
            }

            // Do not allow more than one input schema for stages other than Joiner, Action and Condition
            if (!BatchJoiner.PLUGIN_TYPE.equals(nextStageType)
                && !Action.PLUGIN_TYPE.equals(nextStageType)
                && !Condition.PLUGIN_TYPE.equals(nextStageType)
                && !hasSameSchema(outputStageConfigurer.getInputSchemas(), nextStageInputSchema)) {
                throw new IllegalArgumentException("Two different input schema were set for the stage " + nextStageName);
            }

            outputStageConfigurer.addInputSchema(stageName, nextStageInputSchema);
        }

        specBuilder.addStage(configuredStage.stageSpec);
        for (Map.Entry<String, String> propertyEntry : configuredStage.pipelineProperties.entrySet()) {
            propertiesFromStages.put(propertyEntry.getKey(), propertyEntry.getValue(), stageName);
        }
    }

    // check that multiple stages did not set conflicting properties
    for (String propertyName : propertiesFromStages.rowKeySet()) {
        // go through all values set for the property name. If there is more than one, we have a conflict.
        // the row view maps property value -> stage that set it.
        Map<String, String> propertyValues = propertiesFromStages.row(propertyName);
        if (propertyValues.size() > 1) {
            StringBuilder errMsg = new StringBuilder("Pipeline property '")
                .append(propertyName)
                .append("' is being set to different values by stages.");
            for (Map.Entry<String, String> valueEntry : propertyValues.entrySet()) {
                String propertyValue = valueEntry.getKey();
                String fromStage = valueEntry.getValue();
                errMsg.append(" stage '").append(fromStage).append("' = '").append(propertyValue).append("',");
            }
            // drop the trailing comma left by the loop above
            errMsg.deleteCharAt(errMsg.length() - 1);
            throw new IllegalArgumentException(errMsg.toString());
        }
        // exactly one value was set for this property; it replaces any value taken from the config
        pipelineProperties.put(propertyName, propertyValues.keySet().iterator().next());
    }

    // NOTE(review): the object returned by build() is discarded here - confirm callers obtain the
    // final spec from specBuilder themselves.
    specBuilder.addConnections(config.getConnections())
        .setResources(config.getResources())
        .setDriverResources(config.getDriverResources())
        .setClientResources(config.getClientResources())
        .setStageLoggingEnabled(config.isStageLoggingEnabled())
        .setNumOfRecordsPreview(config.getNumOfRecordsPreview())
        .setProperties(pipelineProperties)
        .build();
}
Also used : HashMap(java.util.HashMap) Schema(co.cask.cdap.api.data.schema.Schema) DefaultStageConfigurer(co.cask.cdap.etl.common.DefaultStageConfigurer) ETLStage(co.cask.cdap.etl.proto.v2.ETLStage) DefaultPipelineConfigurer(co.cask.cdap.etl.common.DefaultPipelineConfigurer) Map(java.util.Map)

Aggregations

ETLStage (co.cask.cdap.etl.proto.v2.ETLStage)94 ETLBatchConfig (co.cask.cdap.etl.proto.v2.ETLBatchConfig)75 Test (org.junit.Test)64 ApplicationId (co.cask.cdap.proto.id.ApplicationId)62 ApplicationManager (co.cask.cdap.test.ApplicationManager)58 AppRequest (co.cask.cdap.proto.artifact.AppRequest)57 Schema (co.cask.cdap.api.data.schema.Schema)51 StructuredRecord (co.cask.cdap.api.data.format.StructuredRecord)50 Table (co.cask.cdap.api.dataset.table.Table)49 WorkflowManager (co.cask.cdap.test.WorkflowManager)44 KeyValueTable (co.cask.cdap.api.dataset.lib.KeyValueTable)39 HashSet (java.util.HashSet)16 ArrayList (java.util.ArrayList)15 ETLPlugin (co.cask.cdap.etl.proto.v2.ETLPlugin)14 HashMap (java.util.HashMap)14 DataStreamsConfig (co.cask.cdap.etl.proto.v2.DataStreamsConfig)11 TimeoutException (java.util.concurrent.TimeoutException)11 TopicNotFoundException (co.cask.cdap.api.messaging.TopicNotFoundException)7 SparkManager (co.cask.cdap.test.SparkManager)7 BatchPipelineSpec (co.cask.cdap.etl.batch.BatchPipelineSpec)6