Search in sources :

Example 71 with ETLConfig

use of io.cdap.cdap.etl.proto.v2.ETLConfig in project cdap by caskdata.

the class DataStreamsSparkSinkTest method testSparkSink.

/**
 * Deploys a streaming pipeline made of a mock source connected to a mock Spark streaming sink whose
 * argument is the macro {@code ${tablename}}, then invokes the two-argument {@code testSparkSink}
 * helper against the deployed application twice, with "output1" and then "output2".
 *
 * NOTE(review): the original excerpt carried a truncated comment here reading
 * "stream-rate-updater thread in Spark."; its full intent cannot be recovered from this view.
 */
@Test
public void testSparkSink() throws Exception {
    Schema schema = Schema.recordOf("test",
        Schema.Field.of("id", Schema.of(Schema.Type.STRING)),
        Schema.Field.of("name", Schema.of(Schema.Type.STRING)));

    // One record per name; the id is the name's position rendered as a string ("0" .. "3").
    String[] names = {"samuel", "jackson", "dwayne", "johnson"};
    List<StructuredRecord> input = new ArrayList<>();
    for (int id = 0; id < names.length; id++) {
        input.add(StructuredRecord.builder(schema)
            .set("id", String.valueOf(id))
            .set("name", names[id])
            .build());
    }

    String checkpointDir = "file://" + TMP_FOLDER.getRoot().toPath().toString();
    DataStreamsConfig etlConfig = DataStreamsConfig.builder()
        .addStage(new ETLStage("source", MockSource.getPlugin(schema, input)))
        .addStage(new ETLStage("sink", io.cdap.cdap.etl.mock.spark.streaming.MockSink.getPlugin("${tablename}")))
        .addConnection("source", "sink")
        .setCheckpointDir(checkpointDir)
        .setBatchInterval("1s")
        .build();

    ApplicationId appId = NamespaceId.DEFAULT.app("sparksinkapp");
    ApplicationManager appManager = deployApplication(appId, new AppRequest<>(APP_ARTIFACT, etlConfig));

    // Same deployed application, exercised once per output name.
    testSparkSink(appManager, "output1");
    testSparkSink(appManager, "output2");
}
Also used : ApplicationManager(io.cdap.cdap.test.ApplicationManager) ETLStage(io.cdap.cdap.etl.proto.v2.ETLStage) Schema(io.cdap.cdap.api.data.schema.Schema) ArrayList(java.util.ArrayList) ApplicationId(io.cdap.cdap.proto.id.ApplicationId) StructuredRecord(io.cdap.cdap.api.data.format.StructuredRecord) DataStreamsConfig(io.cdap.cdap.etl.proto.v2.DataStreamsConfig) AppRequest(io.cdap.cdap.proto.artifact.AppRequest) Test(org.junit.Test)

Example 72 with ETLConfig

use of io.cdap.cdap.etl.proto.v2.ETLConfig in project cdap by caskdata.

the class PreviewDataStreamsTest method testDataStreamsPreviewRun.

/**
 * Starts a preview run of a source -> transform -> sink streaming pipeline fed with three records,
 * waits until three records are traced at the sink, checks the preview store for the source and
 * transform stages, waits for the preview to reach KILLED_BY_TIMER, validates the per-stage record
 * metrics and finally asserts that the sink table does not exist outside the preview.
 */
@Test
public void testDataStreamsPreviewRun() throws Exception {
    PreviewManager previewManager = getPreviewManager();
    String sinkTableName = "singleOutput";
    Schema schema = Schema.recordOf("testRecord", Schema.Field.of("name", Schema.of(Schema.Type.STRING)));

    List<StructuredRecord> records = new ArrayList<>();
    for (String name : new String[] {"samuel", "bob", "test"}) {
        records.add(StructuredRecord.builder(schema).set("name", name).build());
    }

    // Pipeline shape: source --> transform --> sink
    String checkpointDir = "file://" + TMP_FOLDER.getRoot().toPath().toString();
    DataStreamsConfig etlConfig = DataStreamsConfig.builder()
        .addStage(new ETLStage("source", MockSource.getPlugin(schema, records)))
        .addStage(new ETLStage("transform", IdentityTransform.getPlugin()))
        .addStage(new ETLStage("sink", MockSink.getPlugin(sinkTableName)))
        .addConnection("source", "transform")
        .addConnection("transform", "sink")
        .setNumOfRecordsPreview(100)
        .setBatchInterval("1s")
        .setCheckpointDir(checkpointDir)
        .build();

    // The preview config names the program to run and its type.
    PreviewConfig previewConfig = new PreviewConfig(DataStreamsSparkLauncher.NAME, ProgramType.SPARK,
        Collections.<String, String>emptyMap(), 1);
    AppRequest<DataStreamsConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig, previewConfig);

    // Kick off the preview; the returned id is used for every later lookup.
    ApplicationId previewId = previewManager.start(NamespaceId.DEFAULT, appRequest);

    // Poll (up to one minute) until exactly three records have been traced at the sink.
    Tasks.waitFor(true, () -> {
        Map<String, List<JsonElement>> sinkData = previewManager.getData(previewId, "sink");
        if (sinkData == null) {
            return false;
        }
        List<JsonElement> traced = sinkData.get(DATA_TRACER_PROPERTY);
        return traced != null && traced.size() == 3;
    }, 1, TimeUnit.MINUTES);

    // The upstream stages must have seen the same three records.
    checkPreviewStore(previewManager, previewId, "source", 3);
    checkPreviewStore(previewManager, previewId, "transform", 3);

    // Give the timer a minute to shut the pipeline down, then poll for the resulting status.
    TimeUnit.MINUTES.sleep(1);
    Tasks.waitFor(PreviewStatus.Status.KILLED_BY_TIMER,
        () -> previewManager.getStatus(previewId).getStatus(),
        1, TimeUnit.MINUTES);

    // Every stage-level record counter of the preview is expected to be 3.
    String[] metricNames = {
        "source.records.out",
        "transform.records.in",
        "transform.records.out",
        "sink.records.in",
        "sink.records.out"
    };
    for (String metricName : metricNames) {
        validateMetric(3, previewId, metricName, previewManager);
    }

    // The sink table must not have been created in the real space.
    DataSetManager<Table> sinkManager = getDataset(sinkTableName);
    Assert.assertNull(sinkManager.get());
}
Also used : PreviewStatus(io.cdap.cdap.app.preview.PreviewStatus) Table(io.cdap.cdap.api.dataset.table.Table) Schema(io.cdap.cdap.api.data.schema.Schema) ArrayList(java.util.ArrayList) StructuredRecord(io.cdap.cdap.api.data.format.StructuredRecord) TimeoutException(java.util.concurrent.TimeoutException) NotFoundException(io.cdap.cdap.common.NotFoundException) DataStreamsConfig(io.cdap.cdap.etl.proto.v2.DataStreamsConfig) AppRequest(io.cdap.cdap.proto.artifact.AppRequest) PreviewManager(io.cdap.cdap.app.preview.PreviewManager) ETLStage(io.cdap.cdap.etl.proto.v2.ETLStage) JsonElement(com.google.gson.JsonElement) ApplicationId(io.cdap.cdap.proto.id.ApplicationId) Map(java.util.Map) ImmutableMap(com.google.common.collect.ImmutableMap) PreviewConfig(io.cdap.cdap.proto.artifact.preview.PreviewConfig) Test(org.junit.Test)

Example 73 with ETLConfig

use of io.cdap.cdap.etl.proto.v2.ETLConfig in project cdap by caskdata.

the class PipelineSpecGenerator method validateConfig.

/**
 * Validate that this is a valid pipeline. A valid pipeline has the following properties:
 *
 * All stages in the pipeline have a unique name.
 * Source stages have at least one output and no inputs.
 * Sink stages have at least one input and no outputs.
 * There are no cycles in the pipeline.
 * All inputs into a stage have the same schema.
 * ErrorTransforms only have BatchSource, Transform, or BatchAggregator as input stages.
 * AlertPublishers have at least one input and no outputs and don't have SparkSink or BatchSink as input.
 * Action stages can only be at the start or end of the pipeline.
 * Condition stages have at most 2 outputs. Each stage on a condition's output branch has at most a single input.
 *
 * Every outgoing connection of a condition stage must be labelled 'true' or 'false', and each label may be
 * used at most once per condition stage, regardless of the order in which the connections are visited.
 *
 * Returns the stages in the order they should be configured to ensure that all input stages are configured
 * before their output.
 *
 * @param config the user provided configuration
 * @return the validated pipeline, wrapping the config and the order to configure the stages in
 * @throws IllegalArgumentException if the pipeline is invalid
 */
protected ValidatedPipeline validateConfig(ETLConfig config) {
    config.validate();
    if (config.getStages().isEmpty()) {
        throw new IllegalArgumentException("A pipeline must contain at least one stage.");
    }
    Set<String> actionStages = new HashSet<>();
    Set<String> conditionStages = new HashSet<>();
    Map<String, String> stageTypes = new HashMap<>();
    // check stage name uniqueness
    Set<String> stageNames = new HashSet<>();
    for (ETLStage stage : config.getStages()) {
        if (!stageNames.add(stage.getName())) {
            throw new IllegalArgumentException(String.format("Invalid pipeline. Multiple stages are named %s. Please ensure all stage names are unique", stage.getName()));
        }
        // if stage is Action stage, add it to the Action stage set
        if (isAction(stage.getPlugin().getType())) {
            actionStages.add(stage.getName());
        }
        // if the stage is condition add it to the Condition stage set
        if (stage.getPlugin().getType().equals(Condition.PLUGIN_TYPE)) {
            conditionStages.add(stage.getName());
        }
        stageTypes.put(stage.getName(), stage.getPlugin().getType());
    }
    // check that the from and to are names of actual stages
    // also check that conditions have at most 2 outgoing connections each label with true or
    // false but not both
    // All labels seen so far are remembered per condition stage. Remembering only the most recent label
    // would let a repeated label slip through when the other label is visited in between
    // (e.g. true, false, true).
    Map<String, Set<Boolean>> conditionBranches = new HashMap<>();
    for (Connection connection : config.getConnections()) {
        if (!stageNames.contains(connection.getFrom())) {
            throw new IllegalArgumentException(String.format("Invalid connection %s. %s is not a stage.", connection, connection.getFrom()));
        }
        if (!stageNames.contains(connection.getTo())) {
            throw new IllegalArgumentException(String.format("Invalid connection %s. %s is not a stage.", connection, connection.getTo()));
        }
        if (conditionStages.contains(connection.getFrom())) {
            if (connection.getCondition() == null) {
                String msg = String.format("For condition stage %s, the connection %s is not marked with either " + "'true' or 'false'.", connection.getFrom(), connection);
                throw new IllegalArgumentException(msg);
            }
            Set<Boolean> seenBranches = conditionBranches.get(connection.getFrom());
            if (seenBranches == null) {
                seenBranches = new HashSet<>();
                conditionBranches.put(connection.getFrom(), seenBranches);
            }
            // check if connection from the condition node is marked as true or false multiple times;
            // Set.add returns false when the label was already recorded for this condition stage
            if (!seenBranches.add(connection.getCondition())) {
                String msg = String.format("For condition stage '%s', more than one outgoing connections are marked as %s.", connection.getFrom(), connection.getCondition());
                throw new IllegalArgumentException(msg);
            }
        }
    }
    List<ETLStage> traversalOrder = new ArrayList<>(stageNames.size());
    // can only have empty connections if the pipeline consists of a single action.
    if (config.getConnections().isEmpty()) {
        if (actionStages.size() == 1 && stageNames.size() == 1) {
            traversalOrder.add(config.getStages().iterator().next());
            return new ValidatedPipeline(traversalOrder, config);
        } else {
            throw new IllegalArgumentException("Invalid pipeline. There are no connections between stages. " + "This is only allowed if the pipeline consists of a single action plugin.");
        }
    }
    Dag dag = new Dag(config.getConnections());
    // action and condition stages are treated alike as "control" stages in the checks below
    Set<String> controlStages = Sets.union(actionStages, conditionStages);
    Map<String, ETLStage> stages = new HashMap<>();
    for (ETLStage stage : config.getStages()) {
        String stageName = stage.getName();
        Set<String> stageInputs = dag.getNodeInputs(stageName);
        Set<String> stageOutputs = dag.getNodeOutputs(stageName);
        String stageType = stage.getPlugin().getType();
        boolean isSource = isSource(stageType);
        boolean isSink = isSink(stageType);
        // check source plugins are sources in the dag
        if (isSource) {
            // inputs are tolerated only when every one of them is a control stage
            if (!stageInputs.isEmpty() && !controlStages.containsAll(stageInputs)) {
                throw new IllegalArgumentException(String.format("%s %s has incoming connections from %s. %s stages cannot have any incoming connections.", stageType, stageName, Joiner.on(',').join(stageInputs), stageType));
            }
            // check that source plugins are not present after any non-condition/action stage
            Set<String> parents = dag.parentsOf(stageName);
            Set<String> nonControlParents = Sets.difference(parents, controlStages);
            if (nonControlParents.size() > 1) {
                // the stage's nonControlParents should only contain itself
                throw new IllegalArgumentException(String.format("%s %s is invalid. %s stages can only be placed at the start of the pipeline.", stageType, stageName, stageType));
            }
        } else if (isSink) {
            // outputs are tolerated only when every one of them is a control stage
            if (!stageOutputs.isEmpty() && !controlStages.containsAll(stageOutputs)) {
                throw new IllegalArgumentException(String.format("%s %s has outgoing connections to %s. %s stages cannot have any outgoing connections.", stageType, stageName, Joiner.on(',').join(stageOutputs), stageType));
            }
        } else if (ErrorTransform.PLUGIN_TYPE.equals(stageType)) {
            // every input of an error transform must be of a type listed in VALID_ERROR_INPUTS
            for (String inputStage : stageInputs) {
                String inputType = stageTypes.get(inputStage);
                if (!VALID_ERROR_INPUTS.contains(inputType)) {
                    throw new IllegalArgumentException(String.format("ErrorTransform %s cannot have stage %s of type %s as input. Only %s stages can emit errors.", stageName, inputStage, inputType, Joiner.on(',').join(VALID_ERROR_INPUTS)));
                }
            }
        }
        boolean isAction = isAction(stageType);
        // only actions, conditions and sources may lack incoming connections
        if (!isAction && !stageType.equals(Condition.PLUGIN_TYPE) && !isSource && stageInputs.isEmpty()) {
            throw new IllegalArgumentException(String.format("Stage %s is unreachable, it has no incoming connections.", stageName));
        }
        // only actions and sinks may lack outgoing connections
        if (!isAction && !isSink && stageOutputs.isEmpty()) {
            throw new IllegalArgumentException(String.format("Stage %s is a dead end, it has no outgoing connections.", stageName));
        }
        stages.put(stageName, stage);
    }
    // make sure actions are not in the middle of the pipeline -- only at the start and/or end
    for (String actionStage : actionStages) {
        Set<String> actionParents = dag.parentsOf(actionStage);
        Set<String> actionChildren = dag.accessibleFrom(actionStage);
        Set<String> nonControlParents = Sets.difference(actionParents, controlStages);
        Set<String> nonControlChildren = Sets.difference(actionChildren, controlStages);
        if (!nonControlChildren.isEmpty() && !nonControlParents.isEmpty()) {
            throw new IllegalArgumentException(String.format("Action stage '%s' is invalid. Actions can only be placed at the start or end of the pipeline.", actionStage));
        }
    }
    validateConditionBranches(conditionStages, dag);
    // emit the stages in the dag's topological order
    for (String stageName : dag.getTopologicalOrder()) {
        traversalOrder.add(stages.get(stageName));
    }
    return new ValidatedPipeline(traversalOrder, config);
}
Also used : HashMap(java.util.HashMap) Connection(io.cdap.cdap.etl.proto.Connection) ArrayList(java.util.ArrayList) Dag(io.cdap.cdap.etl.planner.Dag) ETLStage(io.cdap.cdap.etl.proto.v2.ETLStage) HashSet(java.util.HashSet)

Example 74 with ETLConfig

use of io.cdap.cdap.etl.proto.v2.ETLConfig in project cdap by caskdata.

the class PipelineSpecGeneratorTest method testGenerateSpec.

/**
 * Generates a batch pipeline spec for a six-stage pipeline (source, t1, t2, t3, sink1, sink2) and
 * asserts that it equals a hand-built spec carrying the expected plugin specs, input schemas,
 * outputs, error schemas, connections, resources and preview settings.
 */
@Test
public void testGenerateSpec() throws ValidationException {
    /*
     * Connections under test:
     *   source -> t1, t2, sink2
     *   t1     -> t2, t3, sink2
     *   t2     -> t3, sink2
     *   t3     -> sink1
     */
    ETLBatchConfig etlConfig = ETLBatchConfig.builder()
        .setTimeSchedule("* * * * *")
        .addStage(new ETLStage("source", MOCK_SOURCE))
        .addStage(new ETLStage("sink1", MOCK_SINK))
        .addStage(new ETLStage("sink2", MOCK_SINK))
        .addStage(new ETLStage("t1", MOCK_TRANSFORM_A))
        .addStage(new ETLStage("t2", MOCK_TRANSFORM_A))
        .addStage(new ETLStage("t3", MOCK_TRANSFORM_B))
        .addConnection("source", "t1")
        .addConnection("source", "t2")
        .addConnection("source", "sink2")
        .addConnection("t1", "t2")
        .addConnection("t1", "t3")
        .addConnection("t1", "sink2")
        .addConnection("t2", "sink2")
        .addConnection("t2", "t3")
        .addConnection("t3", "sink1")
        .setNumOfRecordsPreview(100)
        .build();

    // Spec produced by the generator under test.
    BatchPipelineSpec generated = specGenerator.generateSpec(etlConfig);

    // None of the mock plugins carries any properties.
    Map<String, String> noProperties = ImmutableMap.of();

    // Hand-built spec the generated one must equal, stage by stage.
    PipelineSpec wanted = BatchPipelineSpec.builder()
        .addStage(
            StageSpec.builder("source", new PluginSpec(BatchSource.PLUGIN_TYPE, "mocksource", noProperties, ARTIFACT_ID))
                .addOutput(SCHEMA_A, "t1", "t2", "sink2")
                .build())
        .addStage(
            StageSpec.builder("sink1", new PluginSpec(BatchSink.PLUGIN_TYPE, "mocksink", noProperties, ARTIFACT_ID))
                .addInputSchema("t3", SCHEMA_B)
                .setErrorSchema(SCHEMA_B)
                .build())
        .addStage(
            StageSpec.builder("sink2", new PluginSpec(BatchSink.PLUGIN_TYPE, "mocksink", noProperties, ARTIFACT_ID))
                .addInputSchemas(ImmutableMap.of("t1", SCHEMA_A, "t2", SCHEMA_A, "source", SCHEMA_A))
                .setErrorSchema(SCHEMA_A)
                .build())
        .addStage(
            StageSpec.builder("t1", new PluginSpec(Transform.PLUGIN_TYPE, "mockA", noProperties, ARTIFACT_ID))
                .addInputSchema("source", SCHEMA_A)
                .addOutput(SCHEMA_A, "t2", "t3", "sink2")
                .setErrorSchema(SCHEMA_B)
                .build())
        .addStage(
            StageSpec.builder("t2", new PluginSpec(Transform.PLUGIN_TYPE, "mockA", noProperties, ARTIFACT_ID))
                .addInputSchemas(ImmutableMap.of("source", SCHEMA_A, "t1", SCHEMA_A))
                .addOutput(SCHEMA_A, "t3", "sink2")
                .setErrorSchema(SCHEMA_B)
                .build())
        .addStage(
            StageSpec.builder("t3", new PluginSpec(Transform.PLUGIN_TYPE, "mockB", noProperties, ARTIFACT_ID))
                .addInputSchemas(ImmutableMap.of("t1", SCHEMA_A, "t2", SCHEMA_A))
                .addOutput(SCHEMA_B, "sink1")
                .setErrorSchema(SCHEMA_A)
                .build())
        .addConnections(etlConfig.getConnections())
        .setResources(etlConfig.getResources())
        .setDriverResources(new Resources(1024, 1))
        .setClientResources(new Resources(1024, 1))
        .setStageLoggingEnabled(etlConfig.isStageLoggingEnabled())
        .setNumOfRecordsPreview(etlConfig.getNumOfRecordsPreview())
        .build();

    Assert.assertEquals(wanted, generated);
}
Also used : ETLBatchConfig(io.cdap.cdap.etl.proto.v2.ETLBatchConfig) BatchPipelineSpec(io.cdap.cdap.etl.batch.BatchPipelineSpec) PluginSpec(io.cdap.cdap.etl.proto.v2.spec.PluginSpec) ETLStage(io.cdap.cdap.etl.proto.v2.ETLStage) PipelineSpec(io.cdap.cdap.etl.proto.v2.spec.PipelineSpec) BatchPipelineSpec(io.cdap.cdap.etl.batch.BatchPipelineSpec) Resources(io.cdap.cdap.api.Resources) Test(org.junit.Test)

Example 75 with ETLConfig

use of io.cdap.cdap.etl.proto.v2.ETLConfig in project cdap by caskdata.

the class PipelineSpecGeneratorTest method testConflictingInputSchemasCondition.

/**
 * Expects spec generation to fail with an {@link IllegalArgumentException} for a pipeline in which
 * two different mock transforms (tA and tB) both feed the same condition stage, whose 'true' branch
 * leads to the sink.
 */
@Test(expected = IllegalArgumentException.class)
public void testConflictingInputSchemasCondition() throws ValidationException {
    /*
     * Connections under test:
     *   source -> tA -> cond
     *   source -> tB -> cond
     *   cond --(true)--> sink
     *
     * Per the test name, the outputs of tA and tB are conflicting schemas arriving through the condition.
     */
    ETLBatchConfig conflictingConfig = ETLBatchConfig.builder()
        .setTimeSchedule("* * * * *")
        .addStage(new ETLStage("source", MOCK_SOURCE))
        .addStage(new ETLStage("sink", MOCK_SINK))
        .addStage(new ETLStage("tA", MOCK_TRANSFORM_A))
        .addStage(new ETLStage("tB", MOCK_TRANSFORM_B))
        .addStage(new ETLStage("cond", MOCK_CONDITION))
        .addConnection("source", "tA")
        .addConnection("source", "tB")
        .addConnection("tA", "cond")
        .addConnection("tB", "cond")
        .addConnection("cond", "sink", true)
        .build();

    // Must throw; the @Test annotation asserts the exception type.
    specGenerator.generateSpec(conflictingConfig);
}
Also used : ETLBatchConfig(io.cdap.cdap.etl.proto.v2.ETLBatchConfig) ETLStage(io.cdap.cdap.etl.proto.v2.ETLStage) Test(org.junit.Test)

Aggregations

ETLStage (io.cdap.cdap.etl.proto.v2.ETLStage)84 ETLBatchConfig (io.cdap.cdap.etl.proto.v2.ETLBatchConfig)75 AppRequest (io.cdap.cdap.proto.artifact.AppRequest)59 ApplicationId (io.cdap.cdap.proto.id.ApplicationId)59 Test (org.junit.Test)54 ApplicationManager (io.cdap.cdap.test.ApplicationManager)53 Table (io.cdap.cdap.api.dataset.table.Table)46 StructuredRecord (io.cdap.cdap.api.data.format.StructuredRecord)45 Schema (io.cdap.cdap.api.data.schema.Schema)45 WorkflowManager (io.cdap.cdap.test.WorkflowManager)45 KeyValueTable (io.cdap.cdap.api.dataset.lib.KeyValueTable)35 HashSet (java.util.HashSet)15 ArrayList (java.util.ArrayList)14 HashMap (java.util.HashMap)11 ETLPlugin (io.cdap.cdap.etl.proto.v2.ETLPlugin)9 SpamMessage (io.cdap.cdap.datapipeline.mock.SpamMessage)8 Lineage (io.cdap.cdap.data2.metadata.lineage.Lineage)7 Relation (io.cdap.cdap.data2.metadata.lineage.Relation)7 DatasetFieldLineageSummary (io.cdap.cdap.metadata.DatasetFieldLineageSummary)7 FieldLineageAdmin (io.cdap.cdap.metadata.FieldLineageAdmin)7