Use of co.cask.cdap.etl.common.DefaultStageConfigurer in project cdap by caskdata.
The example below is the configureStages method of the PipelineSpecGenerator class.
/**
* Performs most of the validation and configuration needed by a pipeline.
* Handles stages, connections, resources, and stage logging settings.
*
* @param config user provided ETL config
* @param specBuilder builder for creating a pipeline spec.
*/
protected void configureStages(ETLConfig config, PipelineSpec.Builder specBuilder) {
  // validate the config and determine the order we should configure the stages in.
  List<StageConnections> traversalOrder = validateConfig(config);

  Map<String, DefaultPipelineConfigurer> pluginConfigurers = new HashMap<>(traversalOrder.size());
  Map<String, String> pluginTypes = new HashMap<>(traversalOrder.size());
  for (StageConnections stageConnections : traversalOrder) {
    String stageName = stageConnections.getStage().getName();
    pluginTypes.put(stageName, stageConnections.getStage().getPlugin().getType());
    pluginConfigurers.put(stageName, new DefaultPipelineConfigurer(configurer, stageName, engine));
  }

  // anything prefixed by 'system.[engine].' is a pipeline property.
  Map<String, String> pipelineProperties = new HashMap<>();
  String prefix = String.format("system.%s.", engine.name().toLowerCase());
  int prefixLength = prefix.length();
  for (Map.Entry<String, String> property : config.getProperties().entrySet()) {
    if (property.getKey().startsWith(prefix)) {
      String strippedKey = property.getKey().substring(prefixLength);
      pipelineProperties.put(strippedKey, property.getValue());
    }
  }
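  // For example (hypothetical values): if the engine is SPARK, the prefix is "system.spark.",
  // so a config property "system.spark.spark.executor.memory" = "2g" would be added to
  // pipelineProperties as "spark.executor.memory" = "2g".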

  // row = property name, column = property value, val = stage that set the property
  // this is used so that we can error with a nice message about which stages are setting conflicting properties
  Table<String, String, String> propertiesFromStages = HashBasedTable.create();
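  // For example (hypothetical values): propertiesFromStages.put("spark.executor.memory", "2g", "myAggregator")
  // would record that stage 'myAggregator' asked for pipeline property 'spark.executor.memory' = '2g'.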

  // configure the stages in order and build up the stage specs
  for (StageConnections stageConnections : traversalOrder) {
    ETLStage stage = stageConnections.getStage();
    String stageName = stage.getName();
    DefaultPipelineConfigurer pluginConfigurer = pluginConfigurers.get(stageName);
    ConfiguredStage configuredStage = configureStage(stageConnections, pluginConfigurer);
    Schema outputSchema = configuredStage.stageSpec.getOutputSchema();
    Schema outputErrorSchema = configuredStage.stageSpec.getErrorSchema();
    // for each output stage, set its input schema to our output schema
    for (String outputStageName : stageConnections.getOutputs()) {
      String outputStageType = pluginTypes.get(outputStageName);
      // no need to set any input schemas for an Action plugin
      if (Action.PLUGIN_TYPE.equals(outputStageType)) {
        continue;
      }
      DefaultStageConfigurer outputStageConfigurer = pluginConfigurers.get(outputStageName).getStageConfigurer();
      // Do not allow null input schema for Joiner
      if (BatchJoiner.PLUGIN_TYPE.equals(outputStageType) && outputSchema == null) {
        throw new IllegalArgumentException(String.format(
          "Joiner cannot have any null input schemas, but stage %s outputs a null schema.", stageName));
      }
      // if the output stage is an error transform, it takes the error schema of this stage as its input.
      // all other plugin types take the output schema of this stage as their input.
      Schema nextStageInputSchema = ErrorTransform.PLUGIN_TYPE.equals(outputStageType) ? outputErrorSchema : outputSchema;
      // Do not allow more than one input schema for stages other than Joiner
      if (!BatchJoiner.PLUGIN_TYPE.equals(outputStageType)
          && !hasSameSchema(outputStageConfigurer.getInputSchemas(), nextStageInputSchema)) {
        throw new IllegalArgumentException("Two different input schemas were set for the stage " + outputStageName);
      }
      outputStageConfigurer.addInputSchema(stageName, nextStageInputSchema);
    }
    specBuilder.addStage(configuredStage.stageSpec);
    for (Map.Entry<String, String> propertyEntry : configuredStage.pipelineProperties.entrySet()) {
      propertiesFromStages.put(propertyEntry.getKey(), propertyEntry.getValue(), stageName);
    }
  }

  // check that multiple stages did not set conflicting properties
  for (String propertyName : propertiesFromStages.rowKeySet()) {
    // go through all values set for the property name. If there is more than one, we have a conflict.
    Map<String, String> propertyValues = propertiesFromStages.row(propertyName);
    if (propertyValues.size() > 1) {
      StringBuilder errMsg = new StringBuilder("Pipeline property '")
        .append(propertyName)
        .append("' is being set to different values by stages.");
      for (Map.Entry<String, String> valueEntry : propertyValues.entrySet()) {
        String propertyValue = valueEntry.getKey();
        String fromStage = valueEntry.getValue();
        errMsg.append(" stage '").append(fromStage).append("' = '").append(propertyValue).append("',");
      }
      errMsg.deleteCharAt(errMsg.length() - 1);
      throw new IllegalArgumentException(errMsg.toString());
    }
    pipelineProperties.put(propertyName, propertyValues.keySet().iterator().next());
  }

  specBuilder.addConnections(config.getConnections())
    .setResources(config.getResources())
    .setDriverResources(config.getDriverResources())
    .setClientResources(config.getClientResources())
    .setStageLoggingEnabled(config.isStageLoggingEnabled())
    .setNumOfRecordsPreview(config.getNumOfRecordsPreview())
    .setProperties(pipelineProperties)
    .build();
}
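The hasSameSchema helper called in the schema-propagation loop above is not part of this snippet. The following is a minimal sketch of what such a helper might look like, assuming only that a newly propagated input schema must match whatever schema has already been recorded for the stage, using java.util.Objects for a null-safe comparison; it is an illustration, not the project's actual implementation.

private boolean hasSameSchema(Map<String, Schema> inputSchemas, Schema inputSchema) {
  // no input schema has been recorded yet, so there is nothing to conflict with
  if (inputSchemas.isEmpty()) {
    return true;
  }
  // compare against one already-recorded schema; Objects.equals handles nulls on either side
  return Objects.equals(inputSchemas.values().iterator().next(), inputSchema);
}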