Search in sources :

Example 21 with StageSpec

use of io.cdap.cdap.etl.proto.v2.spec.StageSpec in project cdap by caskdata.

the class SparkPipelineRunner method handleJoin.

/**
 * Runs the join for a stage, choosing the implementation from the runtime type of {@code plugin}.
 *
 * <p>A {@link BatchJoiner} is initialized with a batch runtime context and its stage is always
 * recorded in {@code shufflers}. An {@link AutoJoiner} is asked for its {@link JoinDefinition};
 * its stage is recorded in {@code shufflers} only when none of the join stages is a broadcast.
 *
 * @param inputDataCollections collections keyed by name, handed on to the join implementation
 * @param pipelinePhase phase used to look up the inputs of the join stage and their specs
 * @param pluginFunctionContext source of the batch runtime context for a {@link BatchJoiner}
 * @param stageSpec spec of the join stage; its name identifies the stage
 * @param functionCacheFactory handed on to the {@link BatchJoiner} join implementation
 * @param plugin the instantiated joiner plugin
 * @param numPartitions handed on to the join implementation
 * @param collector handed on to the {@link BatchJoiner} join implementation
 * @param shufflers set that receives the stage name when the join is treated as a shuffle
 * @return the collection produced by the selected join implementation
 * @throws IllegalStateException if the plugin is neither a {@link BatchJoiner} nor an {@link AutoJoiner}
 * @throws Exception if initializing the joiner or performing the join fails
 */
protected SparkCollection<Object> handleJoin(Map<String, SparkCollection<Object>> inputDataCollections, PipelinePhase pipelinePhase, PluginFunctionContext pluginFunctionContext, StageSpec stageSpec, FunctionCache.Factory functionCacheFactory, Object plugin, Integer numPartitions, StageStatisticsCollector collector, Set<String> shufflers) throws Exception {
    final String joinStageName = stageSpec.getName();

    if (plugin instanceof BatchJoiner) {
        BatchJoiner<Object, Object, Object> batchJoiner = (BatchJoiner<Object, Object, Object>) plugin;
        BatchJoinerRuntimeContext runtimeContext = pluginFunctionContext.createBatchRuntimeContext();
        batchJoiner.initialize(runtimeContext);
        shufflers.add(joinStageName);
        return handleJoin(batchJoiner, inputDataCollections, stageSpec, functionCacheFactory, numPartitions, collector);
    }

    if (!(plugin instanceof AutoJoiner)) {
        // should never happen unless there is a bug in the code. should have failed during deployment
        throw new IllegalStateException(String.format("Stage '%s' is an unknown joiner type %s", joinStageName, plugin.getClass().getName()));
    }

    AutoJoiner autoJoiner = (AutoJoiner) plugin;

    // Collect the schema each input stage emits towards this join stage; an input without an
    // output port for this stage is still registered, with a null schema.
    Map<String, Schema> schemasByInput = new HashMap<>();
    for (String inputName : pipelinePhase.getStageInputs(joinStageName)) {
        Port portToJoin = pipelinePhase.getStage(inputName).getOutputPorts().get(joinStageName);
        schemasByInput.put(inputName, portToJoin == null ? null : portToJoin.getSchema());
    }

    FailureCollector failures = new LoggingFailureCollector(joinStageName, schemasByInput);
    AutoJoinerContext joinerContext = DefaultAutoJoinerContext.from(schemasByInput, failures);

    // The definition is expected to be non-null here: per the original note, that is verified by
    // PipelinePhasePreparer at the start of the run.
    JoinDefinition definition = autoJoiner.define(joinerContext);
    failures.getOrThrowException();

    boolean noBroadcastStage = definition.getStages().stream().noneMatch(JoinStage::isBroadcast);
    if (noBroadcastStage) {
        shufflers.add(joinStageName);
    }
    return handleAutoJoin(joinStageName, definition, inputDataCollections, numPartitions);
}
Also used : BatchJoinerRuntimeContext(io.cdap.cdap.etl.api.batch.BatchJoinerRuntimeContext) LoggingFailureCollector(io.cdap.cdap.etl.validation.LoggingFailureCollector) Port(io.cdap.cdap.etl.proto.v2.spec.StageSpec.Port) BatchJoiner(io.cdap.cdap.etl.api.batch.BatchJoiner) DefaultAutoJoinerContext(io.cdap.cdap.etl.common.DefaultAutoJoinerContext) AutoJoinerContext(io.cdap.cdap.etl.api.join.AutoJoinerContext) JoinDefinition(io.cdap.cdap.etl.api.join.JoinDefinition) StageSpec(io.cdap.cdap.etl.proto.v2.spec.StageSpec) AutoJoiner(io.cdap.cdap.etl.api.join.AutoJoiner) Map(java.util.Map) HashMap(java.util.HashMap) FailureCollector(io.cdap.cdap.etl.api.FailureCollector)

Example 22 with StageSpec

use of io.cdap.cdap.etl.proto.v2.spec.StageSpec in project cdap by caskdata.

the class PipelinePlannerTest method testConditionsOnBranches.

/**
 * Plans a pipeline in which two separate branches each end in a condition, and checks the
 * resulting plan against a hand-built expected plan.
 */
@Test
public void testConditionsOnBranches() {
    /*
       Pipeline being planned:

                        |-- true --> n2
              |--> c1 --|
         n1 --|         |-- false --> n3
              |
              |                |-- true --> n5
              |--> n4 --> c2 --|
                               |-- false --> n6
     */
    Set<StageSpec> stageSpecs = ImmutableSet.of(
        StageSpec.builder("c1", CONDITION).build(),
        StageSpec.builder("c2", CONDITION).build(),
        StageSpec.builder("n1", NODE).build(),
        StageSpec.builder("n2", NODE).build(),
        StageSpec.builder("n3", NODE).build(),
        StageSpec.builder("n4", NODE).build(),
        StageSpec.builder("n5", NODE).build(),
        StageSpec.builder("n6", NODE).build());
    Set<Connection> connections = ImmutableSet.of(
        new Connection("n1", "c1"),
        new Connection("n1", "n4"),
        new Connection("c1", "n2", true),
        new Connection("c1", "n3", false),
        new Connection("n4", "c2"),
        new Connection("c2", "n5", true),
        new Connection("c2", "n6", false));
    Set<String> pluginTypes = ImmutableSet.of(NODE.getType(), Constants.Connector.PLUGIN_TYPE, CONDITION.getType());
    Set<String> reduceTypes = ImmutableSet.of();
    PipelinePlanner planner = new PipelinePlanner(
        pluginTypes,
        reduceTypes,
        Collections.<String>emptySet(),
        Collections.<String>emptySet(),
        Collections.<String>emptySet());
    PipelineSpec pipelineSpec = PipelineSpec.builder()
        .addStages(stageSpecs)
        .addConnections(connections)
        .build();

    Map<String, PipelinePhase> expectedPhases = new HashMap<>();
    Set<Connection> expectedPhaseConnections = new HashSet<>();

    // each condition is expected as a single-stage phase named after the condition itself
    expectedPhases.put("c1", PipelinePhase.builder(pluginTypes)
        .addStage(StageSpec.builder("c1", CONDITION).build())
        .build());
    expectedPhases.put("c2", PipelinePhase.builder(pluginTypes)
        .addStage(StageSpec.builder("c2", CONDITION).build())
        .build());

    // head phase: n1 and n4, writing to a connector sink in front of each condition
    PipelinePhase headPhase = PipelinePhase.builder(pluginTypes)
        .addStage(StageSpec.builder("n1", NODE).build())
        .addStage(StageSpec.builder("n4", NODE).build())
        .addStage(StageSpec.builder("c1.connector", connectorSpec("c1.connector", Constants.Connector.SINK_TYPE)).build())
        .addStage(StageSpec.builder("c2.connector", connectorSpec("c2.connector", Constants.Connector.SINK_TYPE)).build())
        .addConnection("n1", "n4")
        .addConnection("n1", "c1.connector")
        .addConnection("n4", "c2.connector")
        .build();
    String headPhaseName = PipelinePlanner.getPhaseName(new Dag(ImmutableSet.of(
        new Connection("n1", "n4"),
        new Connection("n1", "c1"),
        new Connection("n4", "c2"))));
    expectedPhases.put(headPhaseName, headPhase);
    // [n1, n4, c1, c2] -> [c1]
    expectedPhaseConnections.add(new Connection(headPhaseName, "c1"));
    // [n1, n4, c1, c2] -> [c2]
    expectedPhaseConnections.add(new Connection(headPhaseName, "c2"));

    // [c1] -- true --> [c1 -> n2]
    PipelinePhase c1TruePhase = PipelinePhase.builder(pluginTypes)
        .addStage(StageSpec.builder("c1.connector", connectorSpec("c1.connector", Connector.SOURCE_TYPE)).build())
        .addStage(StageSpec.builder("n2", NODE).build())
        .addConnection("c1.connector", "n2")
        .build();
    String c1TruePhaseName = PipelinePlanner.getPhaseName(new Dag(ImmutableSet.of(new Connection("c1", "n2"))));
    expectedPhases.put(c1TruePhaseName, c1TruePhase);
    expectedPhaseConnections.add(new Connection("c1", c1TruePhaseName, true));

    // [c1] -- false --> [c1 -> n3]
    PipelinePhase c1FalsePhase = PipelinePhase.builder(pluginTypes)
        .addStage(StageSpec.builder("c1.connector", connectorSpec("c1.connector", Connector.SOURCE_TYPE)).build())
        .addStage(StageSpec.builder("n3", NODE).build())
        .addConnection("c1.connector", "n3")
        .build();
    String c1FalsePhaseName = PipelinePlanner.getPhaseName(new Dag(ImmutableSet.of(new Connection("c1", "n3"))));
    expectedPhases.put(c1FalsePhaseName, c1FalsePhase);
    expectedPhaseConnections.add(new Connection("c1", c1FalsePhaseName, false));

    // [c2] -- true --> [c2 -> n5]
    PipelinePhase c2TruePhase = PipelinePhase.builder(pluginTypes)
        .addStage(StageSpec.builder("c2.connector", connectorSpec("c2.connector", Connector.SOURCE_TYPE)).build())
        .addStage(StageSpec.builder("n5", NODE).build())
        .addConnection("c2.connector", "n5")
        .build();
    String c2TruePhaseName = PipelinePlanner.getPhaseName(new Dag(ImmutableSet.of(new Connection("c2", "n5"))));
    expectedPhases.put(c2TruePhaseName, c2TruePhase);
    expectedPhaseConnections.add(new Connection("c2", c2TruePhaseName, true));

    // [c2] -- false --> [c2 -> n6]
    PipelinePhase c2FalsePhase = PipelinePhase.builder(pluginTypes)
        .addStage(StageSpec.builder("c2.connector", connectorSpec("c2.connector", Connector.SOURCE_TYPE)).build())
        .addStage(StageSpec.builder("n6", NODE).build())
        .addConnection("c2.connector", "n6")
        .build();
    String c2FalsePhaseName = PipelinePlanner.getPhaseName(new Dag(ImmutableSet.of(new Connection("c2", "n6"))));
    expectedPhases.put(c2FalsePhaseName, c2FalsePhase);
    expectedPhaseConnections.add(new Connection("c2", c2FalsePhaseName, false));

    PipelinePlan expected = new PipelinePlan(expectedPhases, expectedPhaseConnections);
    PipelinePlan actual = planner.plan(pipelineSpec);
    Assert.assertEquals(expected, actual);
}
Also used : HashMap(java.util.HashMap) Connection(io.cdap.cdap.etl.proto.Connection) PipelineSpec(io.cdap.cdap.etl.proto.v2.spec.PipelineSpec) PipelinePhase(io.cdap.cdap.etl.common.PipelinePhase) StageSpec(io.cdap.cdap.etl.proto.v2.spec.StageSpec) HashSet(java.util.HashSet) Test(org.junit.Test)

Example 23 with StageSpec

use of io.cdap.cdap.etl.proto.v2.spec.StageSpec in project cdap by caskdata.

the class PipelinePlannerTest method testConditionsToConditions.

/**
 * Plans a pipeline in which one condition feeds two further conditions, and checks the
 * resulting plan against a hand-built expected plan.
 */
@Test
public void testConditionsToConditions() {
    /*
       Pipeline being planned:

         n1 -- condition1 -- true  --> condition2 -- true --> n2
                    |
                    |------- false --> condition3 -- true --> n3
     */
    Set<StageSpec> stageSpecs = ImmutableSet.of(
        StageSpec.builder("n1", NODE).build(),
        StageSpec.builder("n2", NODE).build(),
        StageSpec.builder("condition1", CONDITION1).build(),
        StageSpec.builder("n3", NODE).build(),
        StageSpec.builder("condition2", CONDITION2).build(),
        StageSpec.builder("condition3", CONDITION3).build());
    Set<Connection> connections = ImmutableSet.of(
        new Connection("n1", "condition1"),
        new Connection("condition1", "condition2", true),
        new Connection("condition1", "condition3", false),
        new Connection("condition2", "n2", true),
        new Connection("condition3", "n3", true));
    Set<String> pluginTypes = ImmutableSet.of(
        NODE.getType(),
        REDUCE.getType(),
        Constants.Connector.PLUGIN_TYPE,
        CONDITION1.getType(),
        CONDITION2.getType(),
        CONDITION3.getType(),
        CONDITION4.getType(),
        CONDITION5.getType());
    Set<String> reduceTypes = ImmutableSet.of(REDUCE.getType());
    Set<String> noTypes = ImmutableSet.of();
    PipelinePlanner planner = new PipelinePlanner(pluginTypes, reduceTypes, noTypes, noTypes, noTypes);
    PipelineSpec pipelineSpec = PipelineSpec.builder()
        .addStages(stageSpecs)
        .addConnections(connections)
        .build();

    Map<String, PipelinePhase> expectedPhases = new HashMap<>();

    // n1 -- condition1.connector
    String sourcePhaseName = PipelinePlanner.getPhaseName(
        new Dag(ImmutableSet.of(new Connection("n1", "condition1"))));
    expectedPhases.put(sourcePhaseName, PipelinePhase.builder(pluginTypes)
        .addStage(StageSpec.builder("n1", NODE).build())
        .addStage(StageSpec.builder("condition1.connector", connectorSpec("condition1.connector", Constants.Connector.SINK_TYPE)).build())
        .addConnection("n1", "condition1.connector")
        .build());

    // the three conditions, each expected as a single-stage phase named after the condition
    String condition1PhaseName = "condition1";
    expectedPhases.put(condition1PhaseName, PipelinePhase.builder(pluginTypes)
        .addStage(StageSpec.builder("condition1", CONDITION1).build())
        .build());
    String condition2PhaseName = "condition2";
    expectedPhases.put(condition2PhaseName, PipelinePhase.builder(pluginTypes)
        .addStage(StageSpec.builder("condition2", CONDITION2).build())
        .build());
    String condition3PhaseName = "condition3";
    expectedPhases.put(condition3PhaseName, PipelinePhase.builder(pluginTypes)
        .addStage(StageSpec.builder("condition3", CONDITION3).build())
        .build());

    // condition1.connector -- n2
    String n2PhaseName = PipelinePlanner.getPhaseName(
        new Dag(ImmutableSet.of(new Connection("condition2", "n2"))));
    expectedPhases.put(n2PhaseName, PipelinePhase.builder(pluginTypes)
        .addStage(StageSpec.builder("condition1.connector", connectorSpec("condition1.connector", Constants.Connector.SOURCE_TYPE)).build())
        .addStage(StageSpec.builder("n2", NODE).build())
        .addConnection("condition1.connector", "n2")
        .build());

    // condition1.connector -- n3
    String n3PhaseName = PipelinePlanner.getPhaseName(
        new Dag(ImmutableSet.of(new Connection("condition3", "n3"))));
    expectedPhases.put(n3PhaseName, PipelinePhase.builder(pluginTypes)
        .addStage(StageSpec.builder("condition1.connector", connectorSpec("condition1.connector", Constants.Connector.SOURCE_TYPE)).build())
        .addStage(StageSpec.builder("n3", NODE).build())
        .addConnection("condition1.connector", "n3")
        .build());

    Set<Connection> expectedPhaseConnections = new HashSet<>();
    expectedPhaseConnections.add(new Connection(sourcePhaseName, condition1PhaseName));
    expectedPhaseConnections.add(new Connection(condition1PhaseName, condition2PhaseName, true));
    expectedPhaseConnections.add(new Connection(condition1PhaseName, condition3PhaseName, false));
    expectedPhaseConnections.add(new Connection(condition2PhaseName, n2PhaseName, true));
    expectedPhaseConnections.add(new Connection(condition3PhaseName, n3PhaseName, true));

    PipelinePlan expected = new PipelinePlan(expectedPhases, expectedPhaseConnections);
    PipelinePlan actual = planner.plan(pipelineSpec);
    Assert.assertEquals(expected, actual);
}
Also used : HashMap(java.util.HashMap) Connection(io.cdap.cdap.etl.proto.Connection) PipelineSpec(io.cdap.cdap.etl.proto.v2.spec.PipelineSpec) PipelinePhase(io.cdap.cdap.etl.common.PipelinePhase) StageSpec(io.cdap.cdap.etl.proto.v2.spec.StageSpec) HashSet(java.util.HashSet) Test(org.junit.Test)

Example 24 with StageSpec

use of io.cdap.cdap.etl.proto.v2.spec.StageSpec in project cdap by caskdata.

the class PipelinePlannerTest method testSimpleCondition.

/**
 * Plans a linear pipeline that ends in a single condition with a true and a false branch, and
 * checks the resulting plan against a hand-built expected plan.
 */
@Test
public void testSimpleCondition() {
    /*
       Pipeline being planned:

         n1 - n2 - condition -- true  --> n3
                       |
                       |------- false --> n4
     */
    Set<StageSpec> stageSpecs = ImmutableSet.of(
        StageSpec.builder("n1", NODE).build(),
        StageSpec.builder("n2", NODE).build(),
        StageSpec.builder("condition", CONDITION).build(),
        StageSpec.builder("n3", NODE).build(),
        StageSpec.builder("n4", NODE).build());
    Set<Connection> connections = ImmutableSet.of(
        new Connection("n1", "n2"),
        new Connection("n2", "condition"),
        new Connection("condition", "n3", true),
        new Connection("condition", "n4", false));
    Set<String> pluginTypes = ImmutableSet.of(
        NODE.getType(),
        REDUCE.getType(),
        Constants.Connector.PLUGIN_TYPE,
        CONDITION.getType());
    Set<String> reduceTypes = ImmutableSet.of(REDUCE.getType());
    Set<String> noTypes = ImmutableSet.of();
    PipelinePlanner planner = new PipelinePlanner(pluginTypes, reduceTypes, noTypes, noTypes, noTypes);
    PipelineSpec pipelineSpec = PipelineSpec.builder()
        .addStages(stageSpecs)
        .addConnections(connections)
        .build();

    Map<String, PipelinePhase> expectedPhases = new HashMap<>();

    // n1--n2--condition.connector
    String upstreamPhaseName = PipelinePlanner.getPhaseName(new Dag(ImmutableSet.of(
        new Connection("n1", "n2"),
        new Connection("n2", "condition"))));
    expectedPhases.put(upstreamPhaseName, PipelinePhase.builder(pluginTypes)
        .addStage(StageSpec.builder("n1", NODE).build())
        .addStage(StageSpec.builder("n2", NODE).build())
        .addStage(StageSpec.builder("condition.connector", connectorSpec("condition.connector", Constants.Connector.SINK_TYPE)).build())
        .addConnection("n1", "n2")
        .addConnection("n2", "condition.connector")
        .build());

    // the condition, expected as a single-stage phase named after the condition
    String conditionPhaseName = "condition";
    expectedPhases.put(conditionPhaseName, PipelinePhase.builder(pluginTypes)
        .addStage(StageSpec.builder("condition", CONDITION).build())
        .build());

    // condition.connector -- n3
    String truePhaseName = PipelinePlanner.getPhaseName(
        new Dag(ImmutableSet.of(new Connection("condition", "n3"))));
    expectedPhases.put(truePhaseName, PipelinePhase.builder(pluginTypes)
        .addStage(StageSpec.builder("condition.connector", connectorSpec("condition.connector", Constants.Connector.SOURCE_TYPE)).build())
        .addStage(StageSpec.builder("n3", NODE).build())
        .addConnection("condition.connector", "n3")
        .build());

    // condition.connector -- n4
    String falsePhaseName = PipelinePlanner.getPhaseName(
        new Dag(ImmutableSet.of(new Connection("condition", "n4"))));
    expectedPhases.put(falsePhaseName, PipelinePhase.builder(pluginTypes)
        .addStage(StageSpec.builder("condition.connector", connectorSpec("condition.connector", Constants.Connector.SOURCE_TYPE)).build())
        .addStage(StageSpec.builder("n4", NODE).build())
        .addConnection("condition.connector", "n4")
        .build());

    Set<Connection> expectedPhaseConnections = new HashSet<>();
    expectedPhaseConnections.add(new Connection(upstreamPhaseName, conditionPhaseName));
    expectedPhaseConnections.add(new Connection(conditionPhaseName, truePhaseName, true));
    expectedPhaseConnections.add(new Connection(conditionPhaseName, falsePhaseName, false));

    PipelinePlan expected = new PipelinePlan(expectedPhases, expectedPhaseConnections);
    PipelinePlan actual = planner.plan(pipelineSpec);
    Assert.assertEquals(expected, actual);
}
Also used : HashMap(java.util.HashMap) Connection(io.cdap.cdap.etl.proto.Connection) PipelineSpec(io.cdap.cdap.etl.proto.v2.spec.PipelineSpec) PipelinePhase(io.cdap.cdap.etl.common.PipelinePhase) StageSpec(io.cdap.cdap.etl.proto.v2.spec.StageSpec) HashSet(java.util.HashSet) Test(org.junit.Test)

Example 25 with StageSpec

use of io.cdap.cdap.etl.proto.v2.spec.StageSpec in project cdap by cdapio.

the class TransformExecutorFactory method getPipeStage.

/**
 * Builds the {@link PipeStage} for one stage of the given pipeline phase.
 *
 * <p>A sink stage gets its pipe stage from {@code getSinkPipeStage}. Any other stage is wrapped
 * together with an emitter that is wired to the already-built pipe stages of its outputs,
 * looked up by name in {@code pipeStages}.
 *
 * @param pipeline the phase containing the stage
 * @param stageName name of the stage to build a pipe stage for
 * @param pipeStages pipe stages of the downstream stages, keyed by stage name
 * @return the pipe stage for {@code stageName}
 * @throws Exception if creating the underlying sink or transformation fails
 */
private PipeStage getPipeStage(PipelinePhase pipeline, String stageName, Map<String, PipeStage> pipeStages) throws Exception {
    StageSpec spec = pipeline.getStage(stageName);
    String stageType = spec.getPluginType();

    // ending stages don't use a PipeEmitter at all
    if (pipeline.getSinks().contains(stageName)) {
        return getSinkPipeStage(spec);
    }

    boolean stageIsConnector = Constants.Connector.PLUGIN_TYPE.equals(stageType);

    // The emitter holds every output PipeStage it has to write to and wraps whatever it is given
    // into a RecordInfo. A connector acting as a source needs a special emitter because it has to
    // build the RecordInfo from the temporary dataset.
    PipeEmitter.Builder emitterBuilder;
    if (stageIsConnector && pipeline.getSources().contains(stageName)) {
        emitterBuilder = ConnectorSourceEmitter.builder(stageName);
    } else {
        emitterBuilder = PipeEmitter.builder(stageName);
    }

    Map<String, StageSpec.Port> portsByOutput = spec.getOutputPorts();
    for (String downstreamName : pipeline.getStageOutputs(stageName)) {
        StageSpec downstreamSpec = pipeline.getStage(downstreamName);
        String downstreamType = downstreamSpec.getPluginType();
        PipeStage downstreamStage = pipeStages.get(downstreamName);

        if (ErrorTransform.PLUGIN_TYPE.equals(downstreamType)) {
            emitterBuilder.addErrorConsumer(downstreamStage);
            continue;
        }
        if (AlertPublisher.PLUGIN_TYPE.equals(downstreamType)) {
            emitterBuilder.addAlertConsumer(downstreamStage);
            continue;
        }
        if (stageIsConnector) {
            // connectors only have a single output
            emitterBuilder.addOutputConsumer(downstreamStage);
            continue;
        }

        // When the output is a connector such as agg5.connector, the port map is keyed by the
        // original 'agg5' rather than 'agg5.connector', so the original stage name has to be read
        // from the connector's plugin properties.
        String portLookupName;
        if (Constants.Connector.PLUGIN_TYPE.equals(downstreamType)) {
            portLookupName = downstreamSpec.getPlugin().getProperties().get(Constants.Connector.ORIGINAL_NAME);
        } else {
            portLookupName = downstreamName;
        }

        String port = null;
        if (portsByOutput.containsKey(portLookupName)) {
            port = portsByOutput.get(portLookupName).getPort();
        }

        if (port == null) {
            emitterBuilder.addOutputConsumer(downstreamStage);
        } else {
            emitterBuilder.addOutputConsumer(downstreamStage, port);
        }
    }

    PipeEmitter emitter = emitterBuilder.build();
    if (!SplitterTransform.PLUGIN_TYPE.equals(stageType)) {
        return new UnwrapPipeStage<>(stageName, getTransformation(spec), emitter);
    }
    // a SplitterTransform has to emit records to the right outputs based on port
    return new MultiOutputTransformPipeStage<>(stageName, getMultiOutputTransform(spec), emitter);
}
Also used : StageSpec(io.cdap.cdap.etl.proto.v2.spec.StageSpec)

Aggregations

StageSpec (io.cdap.cdap.etl.proto.v2.spec.StageSpec)74 HashMap (java.util.HashMap)42 PipelinePhase (io.cdap.cdap.etl.common.PipelinePhase)30 HashSet (java.util.HashSet)24 Map (java.util.Map)24 MacroEvaluator (io.cdap.cdap.api.macro.MacroEvaluator)20 DefaultMacroEvaluator (io.cdap.cdap.etl.common.DefaultMacroEvaluator)20 Connection (io.cdap.cdap.etl.proto.Connection)18 Schema (io.cdap.cdap.api.data.schema.Schema)16 PipelineRuntime (io.cdap.cdap.etl.common.PipelineRuntime)16 ArrayList (java.util.ArrayList)16 BatchPhaseSpec (io.cdap.cdap.etl.batch.BatchPhaseSpec)14 PipelineSpec (io.cdap.cdap.etl.proto.v2.spec.PipelineSpec)14 Test (org.junit.Test)14 PipelinePluginContext (io.cdap.cdap.etl.common.plugin.PipelinePluginContext)12 PluginContext (io.cdap.cdap.api.plugin.PluginContext)10 BasicArguments (io.cdap.cdap.etl.common.BasicArguments)10 List (java.util.List)10 WorkflowToken (io.cdap.cdap.api.workflow.WorkflowToken)8 BatchJoiner (io.cdap.cdap.etl.api.batch.BatchJoiner)8