
Example 46 with DataStreamsConfig

Use of io.cdap.cdap.etl.proto.v2.DataStreamsConfig in project cdap by cdapio.

From the class DataStreamsApp, method configure:

@Override
public void configure() {
    DataStreamsConfig config = getConfig();
    setDescription(Objects.firstNonNull(config.getDescription(), "Data Streams Application"));
    DataStreamsPipelineSpec spec;
    try {
        spec = new DataStreamsPipelineSpecGenerator(
            getConfigurer().getDeployedNamespace(),
            getConfigurer(),
            getConfigurer().getRuntimeConfigurer(),
            ImmutableSet.of(StreamingSource.PLUGIN_TYPE),
            ImmutableSet.of(BatchSink.PLUGIN_TYPE, SparkSink.PLUGIN_TYPE, AlertPublisher.PLUGIN_TYPE),
            getConfigurer()).generateSpec(config);
    } catch (ValidationException e) {
        throw new IllegalArgumentException(
            String.format("Failed to configure pipeline: %s",
                e.getFailures().isEmpty()
                    ? e.getMessage()
                    : e.getFailures().iterator().next().getFullMessage()),
            e);
    }
    addSpark(new DataStreamsSparkLauncher(spec));
}
Also used: ValidationException(io.cdap.cdap.etl.api.validation.ValidationException), DataStreamsConfig(io.cdap.cdap.etl.proto.v2.DataStreamsConfig)
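
For context, configure() consumes a DataStreamsConfig that the deployer supplies alongside the pipeline artifact in an AppRequest. A minimal sketch of such a request is shown below; it assumes the MockSource and MockSink test plugins, the APP_ARTIFACT constant, and the builder methods that appear in the test examples that follow, and is illustrative rather than code taken from the project.

    // Illustrative only: a two-stage streaming pipeline config, built with the
    // same builder calls used in the tests below.
    Schema schema = Schema.recordOf("user", Schema.Field.of("name", Schema.of(Schema.Type.STRING)));
    List<StructuredRecord> records = ImmutableList.of(
        StructuredRecord.builder(schema).set("name", "alice").build());
    DataStreamsConfig config = DataStreamsConfig.builder()
        .setBatchInterval("10s")
        .addStage(new ETLStage("source", MockSource.getPlugin(schema, records)))
        .addStage(new ETLStage("sink", MockSink.getPlugin("outputTable")))
        .addConnection("source", "sink")
        .build();
    // Deploying this request is what ultimately hands the config to configure().
    AppRequest<DataStreamsConfig> appRequest = new AppRequest<>(APP_ARTIFACT, config);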

Example 47 with DataStreamsConfig

Use of io.cdap.cdap.etl.proto.v2.DataStreamsConfig in project cdap by cdapio.

From the class DataStreamsTest, method testErrorTransform:

@Test
public void testErrorTransform() throws Exception {
    Schema inputSchema = Schema.recordOf("user", Schema.Field.of("name", Schema.of(Schema.Type.STRING)));
    List<StructuredRecord> input = ImmutableList.of(
        StructuredRecord.builder(inputSchema).set("name", "Leo").build(),
        StructuredRecord.builder(inputSchema).set("name", "Ralph").build(),
        StructuredRecord.builder(inputSchema).set("name", "Don").build(),
        StructuredRecord.builder(inputSchema).set("name", "Mike").build(),
        StructuredRecord.builder(inputSchema).set("name", "April").build());
    /*
     *
     * source--> filter1 --> filter2 --> agg1 --> agg2
     *              |           |         |        |
     *              |-----------|---------|--------|--------|--> flatten errors --> sink1
     *                                                      |
     *                                                      |--> filter errors --> sink2
     * arrows coming out the right represent output records
     * arrows coming out the bottom represent error records
     * this will test multiple stages from multiple phases emitting errors to the same stage
     * as well as errors from one stage going to multiple stages
     */
    File outputDir = TMP_FOLDER.newFolder();
    String output1 = new File(outputDir, "output1").getAbsolutePath();
    String output2 = new File(outputDir, "output2").getAbsolutePath();
    DataStreamsConfig config = DataStreamsConfig.builder()
        .setBatchInterval("5s")
        .addStage(new ETLStage("source", MockSource.getPlugin(inputSchema, input)))
        .addStage(new ETLStage("filter1", StringValueFilterTransform.getPlugin("name", "Leo")))
        .addStage(new ETLStage("filter2", StringValueFilterTransform.getPlugin("name", "Ralph")))
        .addStage(new ETLStage("agg1", GroupFilterAggregator.getPlugin("name", "Don")))
        .addStage(new ETLStage("agg2", GroupFilterAggregator.getPlugin("name", "Mike")))
        .addStage(new ETLStage("errorflatten", FlattenErrorTransform.getPlugin()))
        .addStage(new ETLStage("errorfilter", FilterErrorTransform.getPlugin(3)))
        .addStage(new ETLStage("sink1", MockExternalSink.getPlugin(UUID.randomUUID().toString(), "sink1", output1)))
        .addStage(new ETLStage("sink2", MockExternalSink.getPlugin(UUID.randomUUID().toString(), "sink2", output2)))
        .addConnection("source", "filter1")
        .addConnection("filter1", "filter2")
        .addConnection("filter2", "agg1")
        .addConnection("agg1", "agg2")
        .addConnection("filter1", "errorflatten")
        .addConnection("filter1", "errorfilter")
        .addConnection("filter2", "errorflatten")
        .addConnection("filter2", "errorfilter")
        .addConnection("agg1", "errorflatten")
        .addConnection("agg1", "errorfilter")
        .addConnection("agg2", "errorflatten")
        .addConnection("agg2", "errorfilter")
        .addConnection("errorflatten", "sink1")
        .addConnection("errorfilter", "sink2")
        .setCheckpointDir(checkpointDir)
        .build();
    AppRequest<DataStreamsConfig> appRequest = new AppRequest<>(APP_ARTIFACT, config);
    ApplicationId appId = NamespaceId.DEFAULT.app("ErrTransformTest");
    ApplicationManager appManager = deployApplication(appId, appRequest);
    Map<String, String> args = Collections.singletonMap(io.cdap.cdap.etl.common.Constants.CONSOLIDATE_STAGES, "true");
    SparkManager sparkManager = appManager.getSparkManager(DataStreamsSparkLauncher.NAME);
    sparkManager.startAndWaitForGoodRun(args, ProgramRunStatus.RUNNING, 10, TimeUnit.SECONDS);
    Schema flattenSchema = Schema.recordOf("erroruser",
        Schema.Field.of("name", Schema.of(Schema.Type.STRING)),
        Schema.Field.of("errMsg", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
        Schema.Field.of("errCode", Schema.nullableOf(Schema.of(Schema.Type.INT))),
        Schema.Field.of("errStage", Schema.nullableOf(Schema.of(Schema.Type.STRING))));
    Set<StructuredRecord> expected = ImmutableSet.of(
        StructuredRecord.builder(flattenSchema).set("name", "Leo").set("errMsg", "bad string value").set("errCode", 1).set("errStage", "filter1").build(),
        StructuredRecord.builder(flattenSchema).set("name", "Ralph").set("errMsg", "bad string value").set("errCode", 1).set("errStage", "filter2").build(),
        StructuredRecord.builder(flattenSchema).set("name", "Don").set("errMsg", "bad val").set("errCode", 3).set("errStage", "agg1").build(),
        StructuredRecord.builder(flattenSchema).set("name", "Mike").set("errMsg", "bad val").set("errCode", 3).set("errStage", "agg2").build());
    Tasks.waitFor(true, () -> {
        Set<StructuredRecord> outputRecords = new HashSet<>(MockExternalSink.readOutput(output1, flattenSchema));
        return expected.equals(outputRecords);
    }, 4, TimeUnit.MINUTES);
    Set<StructuredRecord> expected2 = ImmutableSet.of(
        StructuredRecord.builder(inputSchema).set("name", "Leo").build(),
        StructuredRecord.builder(inputSchema).set("name", "Ralph").build());
    Tasks.waitFor(true, () -> {
        Set<StructuredRecord> outputRecords = new HashSet<>(MockExternalSink.readOutput(output2, inputSchema));
        return expected2.equals(outputRecords);
    }, 4, TimeUnit.MINUTES);
}
Also used: ApplicationManager(io.cdap.cdap.test.ApplicationManager), SparkManager(io.cdap.cdap.test.SparkManager), Schema(io.cdap.cdap.api.data.schema.Schema), StructuredRecord(io.cdap.cdap.api.data.format.StructuredRecord), DataStreamsConfig(io.cdap.cdap.etl.proto.v2.DataStreamsConfig), AppRequest(io.cdap.cdap.proto.artifact.AppRequest), ETLStage(io.cdap.cdap.etl.proto.v2.ETLStage), ApplicationId(io.cdap.cdap.proto.id.ApplicationId), File(java.io.File), HashSet(java.util.HashSet), Test(org.junit.Test)
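
The two Tasks.waitFor blocks above both poll an external sink directory until its contents equal an expected record set. A small helper could factor out that pattern; the sketch below is based only on the MockExternalSink.readOutput and Tasks.waitFor calls already shown and is not an existing utility in the project.

    // Hypothetical helper: poll a MockExternalSink output directory until it
    // holds exactly the expected records, or fail after the timeout.
    private void waitForSinkOutput(String outputDir, Schema schema,
                                   Set<StructuredRecord> expected) throws Exception {
        Tasks.waitFor(true, () -> {
            Set<StructuredRecord> actual = new HashSet<>(MockExternalSink.readOutput(outputDir, schema));
            return expected.equals(actual);
        }, 4, TimeUnit.MINUTES);
    }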

Example 48 with DataStreamsConfig

Use of io.cdap.cdap.etl.proto.v2.DataStreamsConfig in project cdap by cdapio.

From the class PreviewDataStreamsTest, method testDataStreamsPreviewRun:

@Test
public void testDataStreamsPreviewRun() throws Exception {
    PreviewManager previewManager = getPreviewManager();
    String sinkTableName = "singleOutput";
    Schema schema = Schema.recordOf("testRecord", Schema.Field.of("name", Schema.of(Schema.Type.STRING)));
    List<StructuredRecord> records = new ArrayList<>();
    StructuredRecord recordSamuel = StructuredRecord.builder(schema).set("name", "samuel").build();
    StructuredRecord recordBob = StructuredRecord.builder(schema).set("name", "bob").build();
    StructuredRecord recordTest = StructuredRecord.builder(schema).set("name", "test").build();
    records.add(recordSamuel);
    records.add(recordBob);
    records.add(recordTest);
    /*
     * source --> transform -> sink
     */
    DataStreamsConfig etlConfig = DataStreamsConfig.builder()
        .addStage(new ETLStage("source", MockSource.getPlugin(schema, records)))
        .addStage(new ETLStage("transform", IdentityTransform.getPlugin()))
        .addStage(new ETLStage("sink", MockSink.getPlugin(sinkTableName)))
        .addConnection("source", "transform")
        .addConnection("transform", "sink")
        .setNumOfRecordsPreview(100)
        .setBatchInterval("1s")
        .setCheckpointDir("file://" + TMP_FOLDER.getRoot().toPath().toString())
        .build();
    // Construct the preview config with the program name and program type.
    PreviewConfig previewConfig = new PreviewConfig(DataStreamsSparkLauncher.NAME, ProgramType.SPARK, Collections.<String, String>emptyMap(), 1);
    AppRequest<DataStreamsConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig, previewConfig);
    // Start the preview and get the corresponding PreviewRunner.
    ApplicationId previewId = previewManager.start(NamespaceId.DEFAULT, appRequest);
    // Wait for the preview to be running and wait until the records are processed in the sink.
    Tasks.waitFor(true, new Callable<Boolean>() {

        @Override
        public Boolean call() throws Exception {
            Map<String, List<JsonElement>> data = previewManager.getData(previewId, "sink");
            return data != null && data.get(DATA_TRACER_PROPERTY) != null && data.get(DATA_TRACER_PROPERTY).size() == 3;
        }
    }, 1, TimeUnit.MINUTES);
    // check data in source and transform
    checkPreviewStore(previewManager, previewId, "source", 3);
    checkPreviewStore(previewManager, previewId, "transform", 3);
    // Wait for the pipeline to be shutdown by timer.
    TimeUnit.MINUTES.sleep(1);
    Tasks.waitFor(PreviewStatus.Status.KILLED_BY_TIMER, new Callable<PreviewStatus.Status>() {

        @Override
        public PreviewStatus.Status call() throws Exception {
            return previewManager.getStatus(previewId).getStatus();
        }
    }, 1, TimeUnit.MINUTES);
    // Validate the metrics for preview
    validateMetric(3, previewId, "source.records.out", previewManager);
    validateMetric(3, previewId, "transform.records.in", previewManager);
    validateMetric(3, previewId, "transform.records.out", previewManager);
    validateMetric(3, previewId, "sink.records.in", previewManager);
    validateMetric(3, previewId, "sink.records.out", previewManager);
    // Check the sink table is not created in the real space.
    DataSetManager<Table> sinkManager = getDataset(sinkTableName);
    Assert.assertNull(sinkManager.get());
}
Also used: PreviewStatus(io.cdap.cdap.app.preview.PreviewStatus), Table(io.cdap.cdap.api.dataset.table.Table), Schema(io.cdap.cdap.api.data.schema.Schema), ArrayList(java.util.ArrayList), StructuredRecord(io.cdap.cdap.api.data.format.StructuredRecord), TimeoutException(java.util.concurrent.TimeoutException), NotFoundException(io.cdap.cdap.common.NotFoundException), DataStreamsConfig(io.cdap.cdap.etl.proto.v2.DataStreamsConfig), AppRequest(io.cdap.cdap.proto.artifact.AppRequest), PreviewManager(io.cdap.cdap.app.preview.PreviewManager), ETLStage(io.cdap.cdap.etl.proto.v2.ETLStage), JsonElement(com.google.gson.JsonElement), ApplicationId(io.cdap.cdap.proto.id.ApplicationId), Map(java.util.Map), ImmutableMap(com.google.common.collect.ImmutableMap), PreviewConfig(io.cdap.cdap.proto.artifact.preview.PreviewConfig), Test(org.junit.Test)
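
checkPreviewStore and validateMetric are helpers defined elsewhere in PreviewDataStreamsTest and are not part of this snippet. Based only on the previewManager.getData call used in the waitFor block above, checkPreviewStore plausibly amounts to something like the sketch below; the real helper may differ.

    // Hypothetical reconstruction: fetch the tracer data captured for one stage
    // of the preview run and assert on the number of records it holds.
    private void checkPreviewStore(PreviewManager previewManager, ApplicationId previewId,
                                   String stageName, int expectedCount) throws Exception {
        Map<String, List<JsonElement>> data = previewManager.getData(previewId, stageName);
        Assert.assertNotNull(data);
        Assert.assertEquals(expectedCount, data.get(DATA_TRACER_PROPERTY).size());
    }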

Aggregations

DataStreamsConfig (io.cdap.cdap.etl.proto.v2.DataStreamsConfig): 36 uses
ETLStage (io.cdap.cdap.etl.proto.v2.ETLStage): 34 uses
Test (org.junit.Test): 34 uses
StructuredRecord (io.cdap.cdap.api.data.format.StructuredRecord): 30 uses
AppRequest (io.cdap.cdap.proto.artifact.AppRequest): 30 uses
ApplicationId (io.cdap.cdap.proto.id.ApplicationId): 30 uses
HashSet (java.util.HashSet): 30 uses
Schema (io.cdap.cdap.api.data.schema.Schema): 29 uses
ApplicationManager (io.cdap.cdap.test.ApplicationManager): 28 uses
SparkManager (io.cdap.cdap.test.SparkManager): 22 uses
Table (io.cdap.cdap.api.dataset.table.Table): 18 uses
DataStreamsConfig (co.cask.cdap.etl.proto.v2.DataStreamsConfig): 12 uses
TimeoutException (java.util.concurrent.TimeoutException): 12 uses
ETLStage (co.cask.cdap.etl.proto.v2.ETLStage): 11 uses
StructuredRecord (co.cask.cdap.api.data.format.StructuredRecord): 10 uses
Schema (co.cask.cdap.api.data.schema.Schema): 10 uses
AppRequest (co.cask.cdap.proto.artifact.AppRequest): 10 uses
ApplicationId (co.cask.cdap.proto.id.ApplicationId): 10 uses
ApplicationManager (co.cask.cdap.test.ApplicationManager): 9 uses
ArrayList (java.util.ArrayList): 9 uses