Use of io.cdap.cdap.etl.proto.v2.spec.StageSpec in project cdap by caskdata.
The class TransformRunner, method getSinkWriter.
// this is needed because we need to write to the context differently depending on the number of outputs
private OutputWriter<Object, Object> getSinkWriter(MapReduceTaskContext<Object, Object> context,
                                                   PipelinePhase pipelinePhase, Configuration hConf) {
  Set<StageSpec> reducers = pipelinePhase.getStagesOfType(BatchAggregator.PLUGIN_TYPE, BatchJoiner.PLUGIN_TYPE);
  JobContext hadoopContext = context.getHadoopContext();
  // in the map phase of an aggregation or join, everything goes to a single intermediate output
  if (!reducers.isEmpty() && hadoopContext instanceof Mapper.Context) {
    return new SingleOutputWriter<>(context);
  }
  String sinkOutputsStr = hConf.get(ETLMapReduce.SINK_OUTPUTS_KEY);
  // should never be null, since this is set in initialize
  Preconditions.checkNotNull(sinkOutputsStr, "Sink outputs not found in Hadoop conf.");
  Map<String, SinkOutput> sinkOutputs = GSON.fromJson(sinkOutputsStr, ETLMapReduce.SINK_OUTPUTS_TYPE);
  return hasSingleOutput(sinkOutputs)
    ? new SingleOutputWriter<>(context)
    : new MultiOutputWriter<>(context, sinkOutputs);
}
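The method above depends on initialize() having serialized the sink-output map into the Hadoop Configuration under ETLMapReduce.SINK_OUTPUTS_KEY. A minimal sketch of that Gson round-trip, with SinkOutput trimmed to a stand-in class and the key name invented for illustration (the real constants live in ETLMapReduce):

import java.lang.reflect.Type;
import java.util.Map;
import com.google.common.collect.ImmutableMap;
import com.google.gson.Gson;
import com.google.gson.reflect.TypeToken;
import org.apache.hadoop.conf.Configuration;

public class SinkOutputsRoundTrip {
  // trimmed stand-in for the real SinkOutput class
  static class SinkOutput {
    String errorDatasetName;
    SinkOutput(String errorDatasetName) {
      this.errorDatasetName = errorDatasetName;
    }
  }

  private static final Gson GSON = new Gson();
  private static final Type SINK_OUTPUTS_TYPE = new TypeToken<Map<String, SinkOutput>>() { }.getType();
  private static final String SINK_OUTPUTS_KEY = "cdap.etl.sink.outputs"; // hypothetical key name

  public static void main(String[] args) {
    Configuration hConf = new Configuration();
    // driver side: initialize() is expected to do the equivalent of this
    hConf.set(SINK_OUTPUTS_KEY, GSON.toJson(ImmutableMap.of("sink1", new SinkOutput("errors"))));
    // task side: what getSinkWriter() does after the null check
    Map<String, SinkOutput> sinkOutputs = GSON.fromJson(hConf.get(SINK_OUTPUTS_KEY), SINK_OUTPUTS_TYPE);
    System.out.println(sinkOutputs.size() == 1 ? "single sink" : "multiple sinks");
  }
}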
Use of io.cdap.cdap.etl.proto.v2.spec.StageSpec in project cdap by caskdata.
The class PipelinePhase, method registerPlugins.
/**
 * Registers all the plugins with the given pluginConfigurer by calling {@link PluginConfigurer#usePluginClass(String,
 * String, String, PluginProperties, PluginSelector)}
 *
 * @param pluginConfigurer the {@link PluginConfigurer} with which the plugins in this {@link PipelinePhase} need to
 *                         be registered
 * @param runtimeConfigurer the runtime configurer providing runtime arguments for better macro resolution, null
 *                          if this is the initial deploy
 * @param namespace the namespace in which the app is deployed
 */
public void registerPlugins(PluginConfigurer pluginConfigurer, @Nullable RuntimeConfigurer runtimeConfigurer,
                            String namespace) {
  MacroParserOptions options = MacroParserOptions.builder()
    .skipInvalidMacros()
    .setEscaping(false)
    .setFunctionWhitelist(ConnectionMacroEvaluator.FUNCTION_NAME)
    .build();
  MacroEvaluator runtimeEvaluator = null;
  if (runtimeConfigurer != null) {
    Map<String, MacroEvaluator> evaluators =
      Collections.singletonMap(ConnectionMacroEvaluator.FUNCTION_NAME,
                               new ConnectionMacroEvaluator(namespace, runtimeConfigurer));
    runtimeEvaluator = new DefaultMacroEvaluator(new BasicArguments(runtimeConfigurer.getRuntimeArguments()),
                                                 evaluators,
                                                 Collections.singleton(ConnectionMacroEvaluator.FUNCTION_NAME));
  }
  for (StageSpec stageSpec : stagesByName.values()) {
    // we don't need to register connectors, only source, sink, and transform plugins
    if (stageSpec.getPluginType().equals(Constants.Connector.PLUGIN_TYPE)) {
      continue;
    }
    PluginSpec pluginSpec = stageSpec.getPlugin();
    ArtifactVersion version = pluginSpec.getArtifact().getVersion();
    // pin the exact artifact version: an inclusive range whose lower and upper bounds are equal
    ArtifactSelector artifactSelector = new ArtifactSelector(pluginSpec.getArtifact().getScope(),
                                                             pluginSpec.getArtifact().getName(),
                                                             new ArtifactVersionRange(version, true, version, true));
    Map<String, String> prop = pluginSpec.getProperties();
    // at runtime, evaluate connection macros before registering; at deploy time, use the raw properties
    pluginConfigurer.usePluginClass(pluginSpec.getType(), pluginSpec.getName(), stageSpec.getName(),
                                    PluginProperties.builder()
                                      .addAll(runtimeConfigurer == null
                                                ? prop
                                                : pluginConfigurer.evaluateMacros(prop, runtimeEvaluator, options))
                                      .build(),
                                    artifactSelector);
  }
}
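One detail worth calling out: passing the same version as both inclusive bounds of the ArtifactVersionRange pins the plugin to exactly the artifact version the pipeline was configured with. A small illustration (the version string is made up, and versionIsInRange is assumed to behave as in current CDAP APIs):

import io.cdap.cdap.api.artifact.ArtifactVersion;
import io.cdap.cdap.api.artifact.ArtifactVersionRange;

// [2.10.0, 2.10.0]: inclusive on both ends with identical bounds matches exactly one version
ArtifactVersion version = new ArtifactVersion("2.10.0");
ArtifactVersionRange exact = new ArtifactVersionRange(version, true, version, true);
System.out.println(exact.versionIsInRange(new ArtifactVersion("2.10.0"))); // expected: true
System.out.println(exact.versionIsInRange(new ArtifactVersion("2.10.1"))); // expected: false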
Use of io.cdap.cdap.etl.proto.v2.spec.StageSpec in project cdap by caskdata.
The class FieldLineageProcessorTest, method testGeneratedOperations.
@Test
public void testGeneratedOperations() throws Exception {
  // src -> transform1 -> transform2 -> sink
  Schema srcSchema = Schema.recordOf("srcSchema",
    Schema.Field.of("body", Schema.of(Schema.Type.STRING)),
    Schema.Field.of("offset", Schema.of(Schema.Type.INT)));
  Schema transform1Schema = Schema.recordOf("trans1Schema",
    Schema.Field.of("body", Schema.of(Schema.Type.STRING)));
  Schema transform2Schema = Schema.recordOf("trans2Schema",
    Schema.Field.of("id", Schema.of(Schema.Type.INT)),
    Schema.Field.of("name", Schema.of(Schema.Type.STRING)));
  Set<StageSpec> stageSpecs = ImmutableSet.of(
    StageSpec.builder("src", DUMMY_PLUGIN).addOutput(srcSchema, "transform1").build(),
    StageSpec.builder("transform1", DUMMY_PLUGIN).addInputSchema("src", srcSchema).addOutput(transform1Schema, "transform2").build(),
    StageSpec.builder("transform2", DUMMY_PLUGIN).addInputSchema("transform1", transform1Schema).addOutput(transform2Schema, "sink").build(),
    StageSpec.builder("sink", DUMMY_PLUGIN).addInputSchema("transform2", transform2Schema).build());
  Set<Connection> connections = ImmutableSet.of(
    new Connection("src", "transform1"),
    new Connection("transform1", "transform2"),
    new Connection("transform2", "sink"));
  PipelineSpec pipelineSpec = PipelineSpec.builder().addStages(stageSpecs).addConnections(connections).build();
  FieldLineageProcessor processor = new FieldLineageProcessor(pipelineSpec);
  // only the source and the sink report field operations; both transforms report none
  Map<String, List<FieldOperation>> fieldOperations = ImmutableMap.of(
    "src", Collections.singletonList(
      new FieldReadOperation("Read", "1st operation", EndPoint.of("file"), ImmutableList.of("body", "offset"))),
    "transform1", Collections.emptyList(),
    "transform2", Collections.emptyList(),
    "sink", Collections.singletonList(
      new FieldWriteOperation("Write", "4th operation", EndPoint.of("sink"), ImmutableList.of("id", "name"))));
  Set<Operation> operations = processor.validateAndConvert(fieldOperations);
  // operation names come back prefixed with their stage name, and identity transform
  // operations are generated for the stages that reported nothing
  Set<Operation> expected = ImmutableSet.of(
    new ReadOperation("src.Read", "1st operation", EndPoint.of("file"), ImmutableList.of("body", "offset")),
    new TransformOperation("transform1.Transform", "",
      ImmutableList.of(InputField.of("src.Read", "body"), InputField.of("src.Read", "offset")), "body"),
    new TransformOperation("transform2.Transform", "",
      ImmutableList.of(InputField.of("transform1.Transform", "body")), ImmutableList.of("id", "name")),
    new WriteOperation("sink.Write", "4th operation", EndPoint.of("sink"),
      ImmutableList.of(InputField.of("transform2.Transform", "id"), InputField.of("transform2.Transform", "name"))));
  Assert.assertEquals(expected, operations);
}
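The DUMMY_PLUGIN constant is not shown above. A plausible definition, assuming the four-argument PluginSpec constructor (type, name, properties, artifact) used elsewhere on this page:

// hypothetical stand-in; any plugin type and name works, since the test only exercises lineage
private static final PluginSpec DUMMY_PLUGIN =
  new PluginSpec(Transform.PLUGIN_TYPE, "dummy", Collections.emptyMap(), null);

Note what the expected set verifies: validateAndConvert prefixes each operation name with its stage ("Read" becomes "src.Read") and generates identity Transform operations for transform1 and transform2, which reported no operations of their own.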
Use of io.cdap.cdap.etl.proto.v2.spec.StageSpec in project cdap by caskdata.
The class PipelinePlanner, method dagToPipeline.
/**
 * Converts a Dag into a PipelinePhase, using what we know about the plugin type of each node in the dag.
 * The PipelinePhase is what programs will take as input, and keeps track of sources, transforms, sinks, etc.
 *
 * @param dag the dag to convert
 * @param connectors connector nodes across all dags, keyed by connector name, with the original stage name as value
 * @param specs specifications for every stage
 * @param conditionConnectors connectors created for condition stages; their values are also treated as connector nodes
 * @return the converted pipeline phase
 */
private PipelinePhase dagToPipeline(Dag dag, Map<String, String> connectors, Map<String, StageSpec> specs,
                                    Map<String, String> conditionConnectors) {
  PipelinePhase.Builder phaseBuilder = PipelinePhase.builder(supportedPluginTypes);
  for (String stageName : dag.getTopologicalOrder()) {
    Set<String> outputs = dag.getNodeOutputs(stageName);
    if (!outputs.isEmpty()) {
      phaseBuilder.addConnections(stageName, outputs);
    }
    // add connectors
    String originalName = connectors.get(stageName);
    if (originalName != null || conditionConnectors.values().contains(stageName)) {
      String connectorType = dag.getSources().contains(stageName)
        ? Constants.Connector.SOURCE_TYPE
        : Constants.Connector.SINK_TYPE;
      PluginSpec connectorSpec =
        new PluginSpec(Constants.Connector.PLUGIN_TYPE, "connector",
                       ImmutableMap.of(Constants.Connector.ORIGINAL_NAME,
                                       originalName != null ? originalName : stageName,
                                       Constants.Connector.TYPE, connectorType),
                       null);
      phaseBuilder.addStage(StageSpec.builder(stageName, connectorSpec).build());
      continue;
    }
    // add other plugin types
    StageSpec spec = specs.get(stageName);
    phaseBuilder.addStage(spec);
  }
  return phaseBuilder.build();
}
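A note on the connectors map this method consumes (the names here are illustrative, not from the CDAP test suite): connector nodes are keyed by their generated name and map back to the stage they were inserted for, which is what gets stored as ORIGINAL_NAME in the connector's PluginSpec and later recovered in getPipeStage below.

Map<String, String> connectors = new HashMap<>();
connectors.put("agg5.connector", "agg5");
// inside the loop: a node is a connector iff it appears in this map (or among the condition-connector values)
String originalName = connectors.get("agg5.connector"); // "agg5" -> stored as ORIGINAL_NAME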
Use of io.cdap.cdap.etl.proto.v2.spec.StageSpec in project cdap by caskdata.
The class TransformExecutorFactory, method getPipeStage.
private PipeStage getPipeStage(PipelinePhase pipeline, String stageName,
                               Map<String, PipeStage> pipeStages) throws Exception {
  StageSpec stageSpec = pipeline.getStage(stageName);
  String pluginType = stageSpec.getPluginType();
  // handle the ending-stage case, which doesn't use a PipeEmitter
  if (pipeline.getSinks().contains(stageName)) {
    return getSinkPipeStage(stageSpec);
  }
  // create the PipeEmitter, which holds all output PipeStages it needs to write to and wraps any output it gets
  // into a RecordInfo
  // ConnectorSources require a special emitter since they need to build RecordInfo from the temporary dataset
  PipeEmitter.Builder emitterBuilder =
    Constants.Connector.PLUGIN_TYPE.equals(pluginType) && pipeline.getSources().contains(stageName)
      ? ConnectorSourceEmitter.builder(stageName)
      : PipeEmitter.builder(stageName);
  Map<String, StageSpec.Port> outputPorts = stageSpec.getOutputPorts();
  for (String outputStageName : pipeline.getStageOutputs(stageName)) {
    StageSpec outputStageSpec = pipeline.getStage(outputStageName);
    String outputStageType = outputStageSpec.getPluginType();
    PipeStage outputPipeStage = pipeStages.get(outputStageName);
    if (ErrorTransform.PLUGIN_TYPE.equals(outputStageType)) {
      emitterBuilder.addErrorConsumer(outputPipeStage);
    } else if (AlertPublisher.PLUGIN_TYPE.equals(outputStageType)) {
      emitterBuilder.addAlertConsumer(outputPipeStage);
    } else if (Constants.Connector.PLUGIN_TYPE.equals(pluginType)) {
      // connectors only have a single output
      emitterBuilder.addOutputConsumer(outputPipeStage);
    } else {
      // if the output is a connector like 'agg5.connector', outputPorts will contain the original 'agg5' as
      // a key but not 'agg5.connector', so we need to look up the original stage from the connector's plugin spec
      String originalOutputName = Constants.Connector.PLUGIN_TYPE.equals(outputStageType)
        ? outputStageSpec.getPlugin().getProperties().get(Constants.Connector.ORIGINAL_NAME)
        : outputStageName;
      String port = outputPorts.containsKey(originalOutputName) ? outputPorts.get(originalOutputName).getPort() : null;
      if (port != null) {
        emitterBuilder.addOutputConsumer(outputPipeStage, port);
      } else {
        emitterBuilder.addOutputConsumer(outputPipeStage);
      }
    }
  }
  PipeEmitter pipeEmitter = emitterBuilder.build();
  if (SplitterTransform.PLUGIN_TYPE.equals(pluginType)) {
    // a SplitterTransform needs to emit records to the right outputs based on port
    return new MultiOutputTransformPipeStage<>(stageName, getMultiOutputTransform(stageSpec), pipeEmitter);
  } else {
    return new UnwrapPipeStage<>(stageName, getTransformation(stageSpec), pipeEmitter);
  }
}
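Because each stage's emitter looks up the PipeStages of its outputs in the pipeStages map, a caller has to create every consumer before the stage that feeds it. A hedged sketch of that driver loop, assuming PipelinePhase exposes its Dag through a getDag() accessor like the planner code above uses:

// build consumers first by walking the dag in reverse topological order,
// so pipeStages already contains every output stage when getPipeStage asks for it
Map<String, PipeStage> pipeStages = new HashMap<>();
List<String> traversalOrder = new ArrayList<>(pipeline.getDag().getTopologicalOrder());
Collections.reverse(traversalOrder);
for (String stageName : traversalOrder) {
  pipeStages.put(stageName, getPipeStage(pipeline, stageName, pipeStages));
}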