Search in sources :

Example 1 with DefaultPipelineConfigurer

use of co.cask.cdap.etl.common.DefaultPipelineConfigurer in project cdap by caskdata.

From the class PipelineSpecGenerator, method configureStages.

/**
 * Performs most of the validation and configuration needed by a pipeline.
 * Handles stages, connections, resources, and stage logging settings.
 * Stages are configured in traversal order so that each stage's output schema
 * can be propagated to the input schemas of its downstream stages.
 *
 * @param config user provided ETL config
 * @param specBuilder builder for creating a pipeline spec.
 * @throws IllegalArgumentException if a Splitter connection is missing a port, if a Joiner
 *         receives a null input schema, if a non-Joiner/Action/Condition stage receives two
 *         different input schemas, or if multiple stages set conflicting pipeline properties
 */
protected void configureStages(ETLConfig config, PipelineSpec.Builder specBuilder) {
    // validate the config and determine the order we should configure the stages in.
    ValidatedPipeline validatedPipeline = validateConfig(config);
    List<ETLStage> traversalOrder = validatedPipeline.getTraversalOrder();
    // one configurer per stage, created up front so downstream stages can be looked up
    // by name while propagating schemas in the loop below
    Map<String, DefaultPipelineConfigurer<T>> pluginConfigurers = new HashMap<>(traversalOrder.size());
    Map<String, String> pluginTypes = new HashMap<>(traversalOrder.size());
    for (ETLStage stage : traversalOrder) {
        String stageName = stage.getName();
        pluginTypes.put(stageName, stage.getPlugin().getType());
        pluginConfigurers.put(stageName, new DefaultPipelineConfigurer<>(configurer, stageName, engine));
    }
    // anything prefixed by 'system.[engine].' is a pipeline property.
    // the prefix is stripped before the property is stored.
    Map<String, String> pipelineProperties = new HashMap<>();
    String prefix = String.format("system.%s.", engine.name().toLowerCase());
    int prefixLength = prefix.length();
    for (Map.Entry<String, String> property : config.getProperties().entrySet()) {
        if (property.getKey().startsWith(prefix)) {
            String strippedKey = property.getKey().substring(prefixLength);
            pipelineProperties.put(strippedKey, property.getValue());
        }
    }
    // row = property name, column = property value, val = stage that set the property
    // this is used so that we can error with a nice message about which stages are setting conflicting properties
    Table<String, String, String> propertiesFromStages = HashBasedTable.create();
    // configure the stages in order and build up the stage specs
    for (ETLStage stage : traversalOrder) {
        String stageName = stage.getName();
        DefaultPipelineConfigurer<T> pluginConfigurer = pluginConfigurers.get(stageName);
        ConfiguredStage configuredStage = configureStage(stage, validatedPipeline, pluginConfigurer);
        // for each output, set their input schema to our output schema
        for (String nextStageName : validatedPipeline.getOutputs(stageName)) {
            String nextStageType = pluginTypes.get(nextStageName);
            DefaultStageConfigurer outputStageConfigurer = pluginConfigurers.get(nextStageName).getStageConfigurer();
            // if the output stage is an error transform, it takes the error schema of this stage as its input.
            // if the current stage is a splitter transform, it takes the output schema of the port it is connected to
            // all other plugin types take the output schema of this stage as their input.
            Schema nextStageInputSchema;
            if (ErrorTransform.PLUGIN_TYPE.equals(nextStageType)) {
                nextStageInputSchema = configuredStage.stageSpec.getErrorSchema();
            } else if (SplitterTransform.PLUGIN_TYPE.equals(configuredStage.stageSpec.getPlugin().getType())) {
                StageSpec.Port portSpec = configuredStage.stageSpec.getOutputPorts().get(nextStageName);
                // this can happen if the ports are dependent on the data received by the plugin;
                // a null schema here simply means the schema is unknown at configure time
                if (portSpec == null) {
                    nextStageInputSchema = null;
                } else if (portSpec.getPort() == null) {
                    // Should not happen since it should have been validated earlier, but check here just in case
                    throw new IllegalArgumentException(String.format("Must specify a port when connecting Splitter '%s' to '%s'", stageName, nextStageName));
                } else {
                    nextStageInputSchema = portSpec.getSchema();
                }
            } else {
                nextStageInputSchema = configuredStage.stageSpec.getOutputSchema();
            }
            // Do not allow null input schema for Joiner
            if (BatchJoiner.PLUGIN_TYPE.equals(nextStageType) && nextStageInputSchema == null) {
                throw new IllegalArgumentException(String.format("Joiner cannot have any null input schemas, but stage %s " + "outputs a null schema.", stageName));
            }
            // Do not allow more than one input schema for stages other than Joiner and Action
            if (!BatchJoiner.PLUGIN_TYPE.equals(nextStageType) && !Action.PLUGIN_TYPE.equals(nextStageType) && !Condition.PLUGIN_TYPE.equals(nextStageType) && !hasSameSchema(outputStageConfigurer.getInputSchemas(), nextStageInputSchema)) {
                throw new IllegalArgumentException("Two different input schema were set for the stage " + nextStageName);
            }
            outputStageConfigurer.addInputSchema(stageName, nextStageInputSchema);
        }
        specBuilder.addStage(configuredStage.stageSpec);
        // record every pipeline property this stage set, keyed by (name, value) -> stage,
        // so conflicts across stages can be reported with the offending stage names
        for (Map.Entry<String, String> propertyEntry : configuredStage.pipelineProperties.entrySet()) {
            propertiesFromStages.put(propertyEntry.getKey(), propertyEntry.getValue(), stageName);
        }
    }
    // check that multiple stages did not set conflicting properties
    for (String propertyName : propertiesFromStages.rowKeySet()) {
        // go through all values set for the property name. If there is more than one, we have a conflict.
        // row(propertyName) is a map of property value -> stage that set it
        Map<String, String> propertyValues = propertiesFromStages.row(propertyName);
        if (propertyValues.size() > 1) {
            StringBuilder errMsg = new StringBuilder("Pipeline property '").append(propertyName).append("' is being set to different values by stages.");
            for (Map.Entry<String, String> valueEntry : propertyValues.entrySet()) {
                String propertyValue = valueEntry.getKey();
                String fromStage = valueEntry.getValue();
                errMsg.append(" stage '").append(fromStage).append("' = '").append(propertyValue).append("',");
            }
            // drop the trailing comma from the last appended entry
            errMsg.deleteCharAt(errMsg.length() - 1);
            throw new IllegalArgumentException(errMsg.toString());
        }
        // exactly one value for this property; stage-set properties override the
        // 'system.[engine].' properties taken from the config above if names collide
        pipelineProperties.put(propertyName, propertyValues.keySet().iterator().next());
    }
    // NOTE(review): the result of build() is discarded here; the caller builds the spec
    // from specBuilder itself (see generateSpec). This call looks redundant — confirm
    // build() has no required side effects before removing it.
    specBuilder.addConnections(config.getConnections()).setResources(config.getResources()).setDriverResources(config.getDriverResources()).setClientResources(config.getClientResources()).setStageLoggingEnabled(config.isStageLoggingEnabled()).setNumOfRecordsPreview(config.getNumOfRecordsPreview()).setProperties(pipelineProperties).build();
}
Also used : HashMap(java.util.HashMap) Schema(co.cask.cdap.api.data.schema.Schema) DefaultStageConfigurer(co.cask.cdap.etl.common.DefaultStageConfigurer) ETLStage(co.cask.cdap.etl.proto.v2.ETLStage) DefaultPipelineConfigurer(co.cask.cdap.etl.common.DefaultPipelineConfigurer) HashMap(java.util.HashMap) Map(java.util.Map)

Example 2 with DefaultPipelineConfigurer

use of co.cask.cdap.etl.common.DefaultPipelineConfigurer in project cdap by caskdata.

From the class BatchPipelineSpecGenerator, method generateSpec.

/**
 * Generates the spec for a batch pipeline: configures each post-action plugin first,
 * then delegates stage, connection, and resource configuration to configureStages.
 *
 * @param config the batch ETL config to generate a spec for
 * @return the fully built batch pipeline spec
 */
@Override
public BatchPipelineSpec generateSpec(ETLBatchConfig config) {
    BatchPipelineSpec.Builder builder = BatchPipelineSpec.builder();
    // register each post-action (a plugin run after the pipeline completes)
    for (ETLStage postAction : config.getPostActions()) {
        String actionName = postAction.getName();
        DefaultPipelineConfigurer<T> actionConfigurer = new DefaultPipelineConfigurer<>(configurer, actionName, engine);
        PluginSpec actionPluginSpec = configurePlugin(actionName, postAction.getPlugin(), actionConfigurer);
        builder.addAction(new ActionSpec(actionName, actionPluginSpec));
    }
    // configure stages, connections, resources, and pipeline properties on the builder
    configureStages(config, builder);
    return builder.build();
}
Also used : PluginSpec(co.cask.cdap.etl.spec.PluginSpec) ETLStage(co.cask.cdap.etl.proto.v2.ETLStage) DefaultPipelineConfigurer(co.cask.cdap.etl.common.DefaultPipelineConfigurer)

Aggregations

DefaultPipelineConfigurer (co.cask.cdap.etl.common.DefaultPipelineConfigurer)2 ETLStage (co.cask.cdap.etl.proto.v2.ETLStage)2 Schema (co.cask.cdap.api.data.schema.Schema)1 DefaultStageConfigurer (co.cask.cdap.etl.common.DefaultStageConfigurer)1 PluginSpec (co.cask.cdap.etl.spec.PluginSpec)1 HashMap (java.util.HashMap)1 Map (java.util.Map)1