Search in sources :

Example 1 with DefaultStageConfigurer

Use of co.cask.cdap.etl.common.DefaultStageConfigurer in project cdap by caskdata.

From the class PipelineSpecGenerator, method configureStages.

/**
   * Performs most of the validation and configuration needed by a pipeline.
   * Handles stages, connections, resources, and stage logging settings.
   *
   * @param config user provided ETL config
   * @param specBuilder builder for creating a pipeline spec.
   * @throws IllegalArgumentException if a joiner stage would receive a null input schema,
   *   if two different input schemas are set for a non-joiner stage, or if multiple stages
   *   set conflicting values for the same pipeline property
   */
protected void configureStages(ETLConfig config, PipelineSpec.Builder specBuilder) {
    // validate the config and determine the order we should configure the stages in.
    List<StageConnections> traversalOrder = validateConfig(config);
    // pre-create one configurer per stage and record each stage's plugin type, so the
    // configuration loop below can look up both by stage name
    Map<String, DefaultPipelineConfigurer> pluginConfigurers = new HashMap<>(traversalOrder.size());
    Map<String, String> pluginTypes = new HashMap<>(traversalOrder.size());
    for (StageConnections stageConnections : traversalOrder) {
        String stageName = stageConnections.getStage().getName();
        pluginTypes.put(stageName, stageConnections.getStage().getPlugin().getType());
        pluginConfigurers.put(stageName, new DefaultPipelineConfigurer(configurer, stageName, engine));
    }
    // anything prefixed by 'system.[engine].' is a pipeline property.
    Map<String, String> pipelineProperties = new HashMap<>();
    String prefix = String.format("system.%s.", engine.name().toLowerCase());
    int prefixLength = prefix.length();
    for (Map.Entry<String, String> property : config.getProperties().entrySet()) {
        if (property.getKey().startsWith(prefix)) {
            // strip the 'system.[engine].' prefix; the remainder is the property name used below
            String strippedKey = property.getKey().substring(prefixLength);
            pipelineProperties.put(strippedKey, property.getValue());
        }
    }
    // row = property name, column = property value, val = stage that set the property
    // this is used so that we can error with a nice message about which stages are setting conflicting properties
    Table<String, String, String> propertiesFromStages = HashBasedTable.create();
    // configure the stages in order and build up the stage specs
    for (StageConnections stageConnections : traversalOrder) {
        ETLStage stage = stageConnections.getStage();
        String stageName = stage.getName();
        DefaultPipelineConfigurer pluginConfigurer = pluginConfigurers.get(stageName);
        ConfiguredStage configuredStage = configureStage(stageConnections, pluginConfigurer);
        Schema outputSchema = configuredStage.stageSpec.getOutputSchema();
        Schema outputErrorSchema = configuredStage.stageSpec.getErrorSchema();
        // for each output, set their input schema to our output schema
        for (String outputStageName : stageConnections.getOutputs()) {
            String outputStageType = pluginTypes.get(outputStageName);
            // no need to set any input schemas for an Action plugin
            if (Action.PLUGIN_TYPE.equals(outputStageType)) {
                continue;
            }
            DefaultStageConfigurer outputStageConfigurer = pluginConfigurers.get(outputStageName).getStageConfigurer();
            // Do not allow null input schema for Joiner
            if (BatchJoiner.PLUGIN_TYPE.equals(outputStageType) && outputSchema == null) {
                throw new IllegalArgumentException(String.format("Joiner cannot have any null input schemas, but stage %s " + "outputs a null schema.", stageName));
            }
            // if the output stage is an error transform, it takes the error schema of this stage as its input.
            // all other plugin types take the output schema of this stage as their input.
            Schema nextStageInputSchema = ErrorTransform.PLUGIN_TYPE.equals(outputStageType) ? outputErrorSchema : outputSchema;
            // Do not allow more than one input schema for stages other than Joiner
            if (!BatchJoiner.PLUGIN_TYPE.equals(outputStageType) && !hasSameSchema(outputStageConfigurer.getInputSchemas(), nextStageInputSchema)) {
                throw new IllegalArgumentException("Two different input schema were set for the stage " + outputStageName);
            }
            outputStageConfigurer.addInputSchema(stageName, nextStageInputSchema);
        }
        specBuilder.addStage(configuredStage.stageSpec);
        // record (property name, value) -> stage name so conflicts can be reported by stage below
        for (Map.Entry<String, String> propertyEntry : configuredStage.pipelineProperties.entrySet()) {
            propertiesFromStages.put(propertyEntry.getKey(), propertyEntry.getValue(), stageName);
        }
    }
    // check that multiple stages did not set conflicting properties
    for (String propertyName : propertiesFromStages.rowKeySet()) {
        // go through all values set for the property name. If there is more than one, we have a conflict.
        Map<String, String> propertyValues = propertiesFromStages.row(propertyName);
        if (propertyValues.size() > 1) {
            StringBuilder errMsg = new StringBuilder("Pipeline property '").append(propertyName).append("' is being set to different values by stages.");
            for (Map.Entry<String, String> valueEntry : propertyValues.entrySet()) {
                // in each table row, the entry key is the property value and the entry value is
                // the stage that set it (mirrors the propertiesFromStages.put(...) above)
                String propertyValue = valueEntry.getKey();
                String fromStage = valueEntry.getValue();
                errMsg.append(" stage '").append(fromStage).append("' = '").append(propertyValue).append("',");
            }
            // drop the trailing comma appended by the loop above
            errMsg.deleteCharAt(errMsg.length() - 1);
            throw new IllegalArgumentException(errMsg.toString());
        }
        // exactly one value was set for this property; note this overwrites any value with the
        // same name collected from the 'system.[engine].' config properties above
        pipelineProperties.put(propertyName, propertyValues.keySet().iterator().next());
    }
    // NOTE(review): the result of build() is discarded here; presumably the caller retrieves
    // the finished spec from specBuilder elsewhere — confirm against call sites
    specBuilder.addConnections(config.getConnections()).setResources(config.getResources()).setDriverResources(config.getDriverResources()).setClientResources(config.getClientResources()).setStageLoggingEnabled(config.isStageLoggingEnabled()).setNumOfRecordsPreview(config.getNumOfRecordsPreview()).setProperties(pipelineProperties).build();
}
Also used : HashMap(java.util.HashMap) Schema(co.cask.cdap.api.data.schema.Schema) DefaultStageConfigurer(co.cask.cdap.etl.common.DefaultStageConfigurer) ETLStage(co.cask.cdap.etl.proto.v2.ETLStage) DefaultPipelineConfigurer(co.cask.cdap.etl.common.DefaultPipelineConfigurer) HashMap(java.util.HashMap) Map(java.util.Map)

Aggregations

Schema (co.cask.cdap.api.data.schema.Schema)1 DefaultPipelineConfigurer (co.cask.cdap.etl.common.DefaultPipelineConfigurer)1 DefaultStageConfigurer (co.cask.cdap.etl.common.DefaultStageConfigurer)1 ETLStage (co.cask.cdap.etl.proto.v2.ETLStage)1 HashMap (java.util.HashMap)1 Map (java.util.Map)1