Example 6 with StageSpec

Use of io.cdap.cdap.etl.proto.v2.spec.StageSpec in project cdap by caskdata.

From the class TransformRunner, the method getSinkWriter:

// this is needed because we need to write to the context differently depending on the number of outputs
private OutputWriter<Object, Object> getSinkWriter(MapReduceTaskContext<Object, Object> context,
                                                   PipelinePhase pipelinePhase, Configuration hConf) {
    Set<StageSpec> reducers = pipelinePhase.getStagesOfType(BatchAggregator.PLUGIN_TYPE, BatchJoiner.PLUGIN_TYPE);
    JobContext hadoopContext = context.getHadoopContext();
    if (!reducers.isEmpty() && hadoopContext instanceof Mapper.Context) {
        return new SingleOutputWriter<>(context);
    }
    String sinkOutputsStr = hConf.get(ETLMapReduce.SINK_OUTPUTS_KEY);
    // should never happen, this is set in initialize
    Preconditions.checkNotNull(sinkOutputsStr, "Sink outputs not found in Hadoop conf.");
    Map<String, SinkOutput> sinkOutputs = GSON.fromJson(sinkOutputsStr, ETLMapReduce.SINK_OUTPUTS_TYPE);
    return hasSingleOutput(sinkOutputs) ? new SingleOutputWriter<>(context) : new MultiOutputWriter<>(context, sinkOutputs);
}
Also used: Mapper (org.apache.hadoop.mapreduce.Mapper), StageSpec (io.cdap.cdap.etl.proto.v2.spec.StageSpec), JobContext (org.apache.hadoop.mapreduce.JobContext)
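
The hasSingleOutput helper called above is not shown on this page. A minimal sketch of what such a check could look like, assuming it only needs to count the configured sink outputs (the actual CDAP implementation may also take error datasets into account):

// Hypothetical sketch: with exactly one sink output, the simpler
// SingleOutputWriter suffices; otherwise a MultiOutputWriter is needed.
private boolean hasSingleOutput(Map<String, SinkOutput> sinkOutputs) {
    return sinkOutputs.size() == 1;
}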

Example 7 with StageSpec

Use of io.cdap.cdap.etl.proto.v2.spec.StageSpec in project cdap by caskdata.

From the class PipelinePhase, the method registerPlugins:

/**
 * Registers all the plugins with the given pluginConfigurer by calling {@link PluginConfigurer#usePluginClass(String,
 * String, String, PluginProperties, PluginSelector)}.
 *
 * @param pluginConfigurer the {@link PluginConfigurer} with which the plugins in this {@link PipelinePhase} need to
 * be registered
 * @param runtimeConfigurer the runtime configurer providing runtime arguments for better macro resolution, or null
 *                          if this is the initial deploy
 * @param namespace the namespace in which the app is deployed
 */
public void registerPlugins(PluginConfigurer pluginConfigurer, @Nullable RuntimeConfigurer runtimeConfigurer, String namespace) {
    MacroParserOptions options = MacroParserOptions.builder()
        .skipInvalidMacros()
        .setEscaping(false)
        .setFunctionWhitelist(ConnectionMacroEvaluator.FUNCTION_NAME)
        .build();
    MacroEvaluator runtimeEvaluator = null;
    if (runtimeConfigurer != null) {
        Map<String, MacroEvaluator> evaluators = Collections.singletonMap(
            ConnectionMacroEvaluator.FUNCTION_NAME, new ConnectionMacroEvaluator(namespace, runtimeConfigurer));
        runtimeEvaluator = new DefaultMacroEvaluator(
            new BasicArguments(runtimeConfigurer.getRuntimeArguments()), evaluators,
            Collections.singleton(ConnectionMacroEvaluator.FUNCTION_NAME));
    }
    for (StageSpec stageSpec : stagesByName.values()) {
        // we don't need to register connectors, only source, sink, and transform plugins
        if (stageSpec.getPluginType().equals(Constants.Connector.PLUGIN_TYPE)) {
            continue;
        }
        PluginSpec pluginSpec = stageSpec.getPlugin();
        ArtifactVersion version = pluginSpec.getArtifact().getVersion();
        ArtifactSelector artifactSelector = new ArtifactSelector(
            pluginSpec.getArtifact().getScope(), pluginSpec.getArtifact().getName(),
            new ArtifactVersionRange(version, true, version, true));
        Map<String, String> prop = pluginSpec.getProperties();
        pluginConfigurer.usePluginClass(
            pluginSpec.getType(), pluginSpec.getName(), stageSpec.getName(),
            PluginProperties.builder()
                .addAll(runtimeConfigurer == null ? prop
                        : pluginConfigurer.evaluateMacros(prop, runtimeEvaluator, options))
                .build(),
            artifactSelector);
    }
}
Also used: MacroEvaluator (io.cdap.cdap.api.macro.MacroEvaluator), ArtifactVersionRange (io.cdap.cdap.api.artifact.ArtifactVersionRange), MacroParserOptions (io.cdap.cdap.api.macro.MacroParserOptions), PluginSpec (io.cdap.cdap.etl.proto.v2.spec.PluginSpec), ArtifactVersion (io.cdap.cdap.api.artifact.ArtifactVersion), StageSpec (io.cdap.cdap.etl.proto.v2.spec.StageSpec)
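
Note the ArtifactVersionRange(version, true, version, true) call above: both bounds are the same version and both are inclusive, which pins each plugin to exactly the artifact version recorded in its spec. A standalone sketch of that pattern (the version string is illustrative):

// Pin to exactly version 6.7.0, i.e. the range [6.7.0, 6.7.0] with both ends inclusive.
ArtifactVersion version = new ArtifactVersion("6.7.0");
ArtifactVersionRange exactRange = new ArtifactVersionRange(version, true, version, true);

As the javadoc notes, a null runtimeConfigurer (the initial deploy) registers the plugin properties as-is, while a non-null one first resolves connection macros through evaluateMacros.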

Example 8 with StageSpec

Use of io.cdap.cdap.etl.proto.v2.spec.StageSpec in project cdap by caskdata.

From the class FieldLineageProcessorTest, the method testGeneratedOperations:

@Test
public void testGeneratedOperations() throws Exception {
    // src -> transform1 -> transform2 -> sink
    Schema srcSchema = Schema.recordOf("srcSchema",
        Schema.Field.of("body", Schema.of(Schema.Type.STRING)),
        Schema.Field.of("offset", Schema.of(Schema.Type.INT)));
    Schema transform1Schema = Schema.recordOf("trans1Schema",
        Schema.Field.of("body", Schema.of(Schema.Type.STRING)));
    Schema transform2Schema = Schema.recordOf("trans2Schema",
        Schema.Field.of("id", Schema.of(Schema.Type.INT)),
        Schema.Field.of("name", Schema.of(Schema.Type.STRING)));
    Set<StageSpec> stageSpecs = ImmutableSet.of(
        StageSpec.builder("src", DUMMY_PLUGIN).addOutput(srcSchema, "transform1").build(),
        StageSpec.builder("transform1", DUMMY_PLUGIN)
            .addInputSchema("src", srcSchema).addOutput(transform1Schema, "transform2").build(),
        StageSpec.builder("transform2", DUMMY_PLUGIN)
            .addInputSchema("transform1", transform1Schema).addOutput(transform2Schema, "sink").build(),
        StageSpec.builder("sink", DUMMY_PLUGIN).addInputSchema("transform2", transform2Schema).build());
    Set<Connection> connections = ImmutableSet.of(
        new Connection("src", "transform1"),
        new Connection("transform1", "transform2"),
        new Connection("transform2", "sink"));
    PipelineSpec pipelineSpec = PipelineSpec.builder().addStages(stageSpecs).addConnections(connections).build();
    FieldLineageProcessor processor = new FieldLineageProcessor(pipelineSpec);
    Map<String, List<FieldOperation>> fieldOperations = ImmutableMap.of(
        "src", Collections.singletonList(new FieldReadOperation(
            "Read", "1st operation", EndPoint.of("file"), ImmutableList.of("body", "offset"))),
        "transform1", Collections.emptyList(),
        "transform2", Collections.emptyList(),
        "sink", Collections.singletonList(new FieldWriteOperation(
            "Write", "4th operation", EndPoint.of("sink"), ImmutableList.of("id", "name"))));
    Set<Operation> operations = processor.validateAndConvert(fieldOperations);
    Set<Operation> expected = ImmutableSet.of(
        new ReadOperation("src.Read", "1st operation", EndPoint.of("file"), ImmutableList.of("body", "offset")),
        new TransformOperation("transform1.Transform", "",
            ImmutableList.of(InputField.of("src.Read", "body"), InputField.of("src.Read", "offset")), "body"),
        new TransformOperation("transform2.Transform", "",
            ImmutableList.of(InputField.of("transform1.Transform", "body")), ImmutableList.of("id", "name")),
        new WriteOperation("sink.Write", "4th operation", EndPoint.of("sink"),
            ImmutableList.of(InputField.of("transform2.Transform", "id"), InputField.of("transform2.Transform", "name"))));
    Assert.assertEquals(expected, operations);
}
Also used: ReadOperation (io.cdap.cdap.api.lineage.field.ReadOperation), FieldReadOperation (io.cdap.cdap.etl.api.lineage.field.FieldReadOperation), Schema (io.cdap.cdap.api.data.schema.Schema), Connection (io.cdap.cdap.etl.proto.Connection), FieldOperation (io.cdap.cdap.etl.api.lineage.field.FieldOperation), FieldWriteOperation (io.cdap.cdap.etl.api.lineage.field.FieldWriteOperation), TransformOperation (io.cdap.cdap.api.lineage.field.TransformOperation), Operation (io.cdap.cdap.api.lineage.field.Operation), WriteOperation (io.cdap.cdap.api.lineage.field.WriteOperation), PipelineSpec (io.cdap.cdap.etl.proto.v2.spec.PipelineSpec), StageSpec (io.cdap.cdap.etl.proto.v2.spec.StageSpec), ImmutableList (com.google.common.collect.ImmutableList), List (java.util.List), Test (org.junit.Test)
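
The DUMMY_PLUGIN constant is defined elsewhere in the test class. A hypothetical definition that would satisfy this test, assuming any PluginSpec works here because lineage validation only looks at stage names, schemas, and connections (the real constant may use different values):

// Hypothetical stand-in plugin spec; the type and name are arbitrary and no artifact is attached.
private static final PluginSpec DUMMY_PLUGIN = new PluginSpec("batchsource", "dummy", ImmutableMap.of(), null);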

Example 9 with StageSpec

Use of io.cdap.cdap.etl.proto.v2.spec.StageSpec in project cdap by caskdata.

From the class PipelinePlanner, the method dagToPipeline:

/**
 * Converts a Dag into a PipelinePhase, using what we know about the plugin type of each node in the dag.
 * The PipelinePhase is what programs will take as input, and keeps track of sources, transforms, sinks, etc.
 *
 * @param dag the dag to convert
 * @param connectors connector nodes across all dags
 * @param specs specifications for every stage
 * @param conditionConnectors connector nodes generated for condition stages
 * @return the PipelinePhase created from the dag
 */
private PipelinePhase dagToPipeline(Dag dag, Map<String, String> connectors, Map<String, StageSpec> specs,
                                    Map<String, String> conditionConnectors) {
    PipelinePhase.Builder phaseBuilder = PipelinePhase.builder(supportedPluginTypes);
    for (String stageName : dag.getTopologicalOrder()) {
        Set<String> outputs = dag.getNodeOutputs(stageName);
        if (!outputs.isEmpty()) {
            phaseBuilder.addConnections(stageName, outputs);
        }
        // add connectors
        String originalName = connectors.get(stageName);
        if (originalName != null || conditionConnectors.values().contains(stageName)) {
            String connectorType = dag.getSources().contains(stageName) ? Constants.Connector.SOURCE_TYPE : Constants.Connector.SINK_TYPE;
            PluginSpec connectorSpec = new PluginSpec(
                Constants.Connector.PLUGIN_TYPE, "connector",
                ImmutableMap.of(Constants.Connector.ORIGINAL_NAME, originalName != null ? originalName : stageName,
                                Constants.Connector.TYPE, connectorType),
                null);
            phaseBuilder.addStage(StageSpec.builder(stageName, connectorSpec).build());
            continue;
        }
        // add other plugin types
        StageSpec spec = specs.get(stageName);
        phaseBuilder.addStage(spec);
    }
    return phaseBuilder.build();
}
Also used: PluginSpec (io.cdap.cdap.etl.proto.v2.spec.PluginSpec), PipelinePhase (io.cdap.cdap.etl.common.PipelinePhase), StageSpec (io.cdap.cdap.etl.proto.v2.spec.StageSpec)
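
As a concrete sketch, this is the connector StageSpec the method generates for a source-side connector named "agg5.connector" that stands in for an original stage "agg5" (both names are illustrative):

// The connector stage carries only its original stage name and its side (source or sink)
// as plugin properties, and is pinned to no artifact of its own.
PluginSpec connectorSpec = new PluginSpec(
    Constants.Connector.PLUGIN_TYPE, "connector",
    ImmutableMap.of(Constants.Connector.ORIGINAL_NAME, "agg5",
                    Constants.Connector.TYPE, Constants.Connector.SOURCE_TYPE),
    null);
StageSpec connectorStage = StageSpec.builder("agg5.connector", connectorSpec).build();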

Example 10 with StageSpec

Use of io.cdap.cdap.etl.proto.v2.spec.StageSpec in project cdap by caskdata.

From the class TransformExecutorFactory, the method getPipeStage:

private PipeStage getPipeStage(PipelinePhase pipeline, String stageName, Map<String, PipeStage> pipeStages) throws Exception {
    StageSpec stageSpec = pipeline.getStage(stageName);
    String pluginType = stageSpec.getPluginType();
    // handle the ending-stage case, which doesn't use a PipeEmitter
    if (pipeline.getSinks().contains(stageName)) {
        return getSinkPipeStage(stageSpec);
    }
    // create PipeEmitter, which holds all output PipeStages it needs to write to and wraps any output it gets
    // into a RecordInfo
    // ConnectorSources require a special emitter since they need to build RecordInfo from the temporary dataset
    PipeEmitter.Builder emitterBuilder =
        Constants.Connector.PLUGIN_TYPE.equals(pluginType) && pipeline.getSources().contains(stageName)
            ? ConnectorSourceEmitter.builder(stageName)
            : PipeEmitter.builder(stageName);
    Map<String, StageSpec.Port> outputPorts = stageSpec.getOutputPorts();
    for (String outputStageName : pipeline.getStageOutputs(stageName)) {
        StageSpec outputStageSpec = pipeline.getStage(outputStageName);
        String outputStageType = outputStageSpec.getPluginType();
        PipeStage outputPipeStage = pipeStages.get(outputStageName);
        if (ErrorTransform.PLUGIN_TYPE.equals(outputStageType)) {
            emitterBuilder.addErrorConsumer(outputPipeStage);
        } else if (AlertPublisher.PLUGIN_TYPE.equals(outputStageType)) {
            emitterBuilder.addAlertConsumer(outputPipeStage);
        } else if (Constants.Connector.PLUGIN_TYPE.equals(pluginType)) {
            // connectors only have a single output
            emitterBuilder.addOutputConsumer(outputPipeStage);
        } else {
            // if the output is a connector like agg5.connector, outputPorts will contain the original 'agg5' as
            // a key, but not 'agg5.connector', so we need to look up the original stage from the connector's plugin spec
            String originalOutputName = Constants.Connector.PLUGIN_TYPE.equals(outputStageType)
                ? outputStageSpec.getPlugin().getProperties().get(Constants.Connector.ORIGINAL_NAME)
                : outputStageName;
            String port = outputPorts.containsKey(originalOutputName)
                ? outputPorts.get(originalOutputName).getPort()
                : null;
            if (port != null) {
                emitterBuilder.addOutputConsumer(outputPipeStage, port);
            } else {
                emitterBuilder.addOutputConsumer(outputPipeStage);
            }
        }
    }
    PipeEmitter pipeEmitter = emitterBuilder.build();
    if (SplitterTransform.PLUGIN_TYPE.equals(pluginType)) {
        // this is a SplitterTransform; it needs to emit records to the right outputs based on port
        return new MultiOutputTransformPipeStage<>(stageName, getMultiOutputTransform(stageSpec), pipeEmitter);
    } else {
        return new UnwrapPipeStage<>(stageName, getTransformation(stageSpec), pipeEmitter);
    }
}
Also used: StageSpec (io.cdap.cdap.etl.proto.v2.spec.StageSpec)
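
Because getPipeStage looks up its output stages in the pipeStages map, a caller has to build the map from the end of the pipeline backwards. A hypothetical driver loop, assuming PipelinePhase exposes its Dag via a getDag() accessor (as the Dag/getTopologicalOrder pairing in the planner example above suggests):

// Visit stages in reverse topological order so that every downstream PipeStage
// already exists by the time its upstream stage is constructed.
Map<String, PipeStage> pipeStages = new HashMap<>();
List<String> traversal = new ArrayList<>(pipeline.getDag().getTopologicalOrder());
Collections.reverse(traversal);
for (String stageName : traversal) {
    pipeStages.put(stageName, getPipeStage(pipeline, stageName, pipeStages));
}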

Aggregations

StageSpec (io.cdap.cdap.etl.proto.v2.spec.StageSpec): 74
HashMap (java.util.HashMap): 42
PipelinePhase (io.cdap.cdap.etl.common.PipelinePhase): 30
HashSet (java.util.HashSet): 24
Map (java.util.Map): 24
MacroEvaluator (io.cdap.cdap.api.macro.MacroEvaluator): 20
DefaultMacroEvaluator (io.cdap.cdap.etl.common.DefaultMacroEvaluator): 20
Connection (io.cdap.cdap.etl.proto.Connection): 18
Schema (io.cdap.cdap.api.data.schema.Schema): 16
PipelineRuntime (io.cdap.cdap.etl.common.PipelineRuntime): 16
ArrayList (java.util.ArrayList): 16
BatchPhaseSpec (io.cdap.cdap.etl.batch.BatchPhaseSpec): 14
PipelineSpec (io.cdap.cdap.etl.proto.v2.spec.PipelineSpec): 14
Test (org.junit.Test): 14
PipelinePluginContext (io.cdap.cdap.etl.common.plugin.PipelinePluginContext): 12
PluginContext (io.cdap.cdap.api.plugin.PluginContext): 10
BasicArguments (io.cdap.cdap.etl.common.BasicArguments): 10
List (java.util.List): 10
WorkflowToken (io.cdap.cdap.api.workflow.WorkflowToken): 8
BatchJoiner (io.cdap.cdap.etl.api.batch.BatchJoiner): 8