Use of co.cask.cdap.etl.proto.v2.ETLStage in project cdap by caskdata.
From class DataStreamsTest, method testWindower.
@Test
public void testWindower() throws Exception {
  /*
   * source --> window(width=30,interval=1) --> aggregator --> filter --> sink
   */
  Schema schema = Schema.recordOf("data", Schema.Field.of("x", Schema.of(Schema.Type.STRING)));
  List<StructuredRecord> input = ImmutableList.of(
    StructuredRecord.builder(schema).set("x", "abc").build(),
    StructuredRecord.builder(schema).set("x", "abc").build(),
    StructuredRecord.builder(schema).set("x", "abc").build());
  String sinkName = "windowOut";
  // source sleeps 1 second between outputs
  DataStreamsConfig etlConfig = DataStreamsConfig.builder()
    .addStage(new ETLStage("source", MockSource.getPlugin(schema, input, 1000L)))
    .addStage(new ETLStage("window", Window.getPlugin(30, 1)))
    .addStage(new ETLStage("agg", FieldCountAggregator.getPlugin("x", "string")))
    .addStage(new ETLStage("filter", StringValueFilterTransform.getPlugin("x", "all")))
    .addStage(new ETLStage("sink", MockSink.getPlugin(sinkName)))
    .addConnection("source", "window")
    .addConnection("window", "agg")
    .addConnection("agg", "filter")
    .addConnection("filter", "sink")
    .setBatchInterval("1s")
    .build();
  AppRequest<DataStreamsConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
  ApplicationId appId = NamespaceId.DEFAULT.app("WindowerApp");
  ApplicationManager appManager = deployApplication(appId, appRequest);
  SparkManager sparkManager = appManager.getSparkManager(DataStreamsSparkLauncher.NAME);
  sparkManager.start();
  sparkManager.waitForStatus(true, 10, 1);
  // The sink should contain at least one record with a count of 3, and no record with a count greater than 3.
  // A window may have a count below 3 if it doesn't contain all 3 records yet, but eventually there should be
  // a window that contains all 3.
  final DataSetManager<Table> outputManager = getDataset(sinkName);
  Tasks.waitFor(true, new Callable<Boolean>() {
    @Override
    public Boolean call() throws Exception {
      outputManager.flush();
      boolean sawThree = false;
      for (StructuredRecord record : MockSink.readOutput(outputManager)) {
        long count = record.get("ct");
        if (count == 3L) {
          sawThree = true;
        }
        Assert.assertTrue(count <= 3L);
      }
      return sawThree;
    }
  }, 2, TimeUnit.MINUTES);
  sparkManager.stop();
}
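Stripped of the test scaffolding, the ETLStage usage above reduces to a simple pattern: each stage pairs a stage name with an ETLPlugin, and the pipeline topology is declared as connections between stage names. A minimal sketch, assuming the same imports as the test and reusing its schema, input records, and mock plugins (which are test-only helpers):

// Minimal streaming-pipeline wiring with ETLStage; MockSource/MockSink come from the test above.
DataStreamsConfig streamsConfig = DataStreamsConfig.builder()
  .addStage(new ETLStage("source", MockSource.getPlugin(schema, input, 1000L)))
  .addStage(new ETLStage("sink", MockSink.getPlugin("windowOut")))
  .addConnection("source", "sink")
  .setBatchInterval("1s")
  .build();
// Deployment wraps the config in an AppRequest against the data-streams app artifact.
AppRequest<DataStreamsConfig> request = new AppRequest<>(APP_ARTIFACT, streamsConfig);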
Use of co.cask.cdap.etl.proto.v2.ETLStage in project cdap by caskdata.
From class PreviewDataStreamsTest, method testDataStreamsPreviewRun.
@Test
public void testDataStreamsPreviewRun() throws Exception {
  PreviewManager previewManager = getPreviewManager();
  String sinkTableName = "singleOutput";
  Schema schema = Schema.recordOf("testRecord", Schema.Field.of("name", Schema.of(Schema.Type.STRING)));
  List<StructuredRecord> records = new ArrayList<>();
  StructuredRecord recordSamuel = StructuredRecord.builder(schema).set("name", "samuel").build();
  StructuredRecord recordBob = StructuredRecord.builder(schema).set("name", "bob").build();
  StructuredRecord recordTest = StructuredRecord.builder(schema).set("name", "test").build();
  records.add(recordSamuel);
  records.add(recordBob);
  records.add(recordTest);
  /*
   * source --> transform --> sink
   */
  DataStreamsConfig etlConfig = DataStreamsConfig.builder()
    .addStage(new ETLStage("source", MockSource.getPlugin(schema, records)))
    .addStage(new ETLStage("transform", IdentityTransform.getPlugin()))
    .addStage(new ETLStage("sink", MockSink.getPlugin(sinkTableName)))
    .addConnection("source", "transform")
    .addConnection("transform", "sink")
    .setNumOfRecordsPreview(100)
    .setBatchInterval("1s")
    .build();
  // Construct the preview config with the program name and program type.
  PreviewConfig previewConfig = new PreviewConfig(DataStreamsSparkLauncher.NAME, ProgramType.SPARK, Collections.<String, String>emptyMap(), 1);
  AppRequest<DataStreamsConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig, previewConfig);
  // Start the preview and get the corresponding PreviewRunner.
  ApplicationId previewId = previewManager.start(NamespaceId.DEFAULT, appRequest);
  final PreviewRunner previewRunner = previewManager.getRunner(previewId);
  // Wait for the preview to be running, then wait until the records have been processed by the sink.
  Tasks.waitFor(true, new Callable<Boolean>() {
    @Override
    public Boolean call() throws Exception {
      Map<String, List<JsonElement>> data = previewRunner.getData("sink");
      return data != null && data.get(DATA_TRACER_PROPERTY) != null && data.get(DATA_TRACER_PROPERTY).size() == 3;
    }
  }, 1, TimeUnit.MINUTES);
  // Check the data captured for the source and transform stages.
  checkPreviewStore(previewRunner, "source", 3);
  checkPreviewStore(previewRunner, "transform", 3);
  // Wait for the pipeline to be shut down by the timer.
  TimeUnit.MINUTES.sleep(1);
  Tasks.waitFor(PreviewStatus.Status.KILLED_BY_TIMER, new Callable<PreviewStatus.Status>() {
    @Override
    public PreviewStatus.Status call() throws Exception {
      return previewRunner.getStatus().getStatus();
    }
  }, 1, TimeUnit.MINUTES);
  // Validate the metrics for the preview run.
  validateMetric(3, previewId, "source.records.out", previewRunner);
  validateMetric(3, previewId, "transform.records.in", previewRunner);
  validateMetric(3, previewId, "transform.records.out", previewRunner);
  validateMetric(3, previewId, "sink.records.in", previewRunner);
  validateMetric(3, previewId, "sink.records.out", previewRunner);
  // Check that the sink table was not created in the real (non-preview) space.
  DataSetManager<Table> sinkManager = getDataset(sinkTableName);
  Assert.assertNull(sinkManager.get());
}
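The preview-specific part of the example is small: the same DataStreamsConfig is wrapped in an AppRequest together with a PreviewConfig, and the request is started through the PreviewManager rather than being deployed. A condensed sketch, reusing etlConfig and previewManager from the test above:

// Attach a PreviewConfig (program name, program type, runtime args) to the app request.
// The final argument (1) is presumably the preview timeout in minutes, which the KILLED_BY_TIMER wait above relies on.
PreviewConfig previewConfig = new PreviewConfig(DataStreamsSparkLauncher.NAME, ProgramType.SPARK, Collections.<String, String>emptyMap(), 1);
AppRequest<DataStreamsConfig> request = new AppRequest<>(APP_ARTIFACT, etlConfig, previewConfig);
// Starting through the PreviewManager runs the pipeline in the preview space, not the real one.
ApplicationId previewId = previewManager.start(NamespaceId.DEFAULT, request);
PreviewRunner runner = previewManager.getRunner(previewId);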
Use of co.cask.cdap.etl.proto.v2.ETLStage in project cdap by caskdata.
From class PipelineTest, method testTextFileSinkAndDeletePostAction.
@Test
public void testTextFileSinkAndDeletePostAction() throws Exception {
  // create the pipeline config
  String inputName = "sinkTestInput";
  String outputName = "sinkTestOutput";
  String outputDirName = "users";
  ETLStage source = new ETLStage("source", MockSource.getPlugin(inputName));
  Map<String, String> sinkProperties = new HashMap<>();
  sinkProperties.put(TextFileSetSink.Conf.FILESET_NAME, outputName);
  sinkProperties.put(TextFileSetSink.Conf.FIELD_SEPARATOR, "|");
  sinkProperties.put(TextFileSetSink.Conf.OUTPUT_DIR, "${dir}");
  ETLStage sink = new ETLStage("sink", new ETLPlugin(TextFileSetSink.NAME, BatchSink.PLUGIN_TYPE, sinkProperties, null));
  Map<String, String> actionProperties = new HashMap<>();
  actionProperties.put(FilesetDeletePostAction.Conf.FILESET_NAME, outputName);
  // MapReduce writes multiple files to the output directory. Along with the actual output,
  // there are various .crc files that do not contain any of the output content.
  actionProperties.put(FilesetDeletePostAction.Conf.DELETE_REGEX, ".*\\.crc|_SUCCESS");
  actionProperties.put(FilesetDeletePostAction.Conf.DIRECTORY, outputDirName);
  ETLStage postAction = new ETLStage("cleanup", new ETLPlugin(FilesetDeletePostAction.NAME, PostAction.PLUGIN_TYPE, actionProperties, null));
  ETLBatchConfig pipelineConfig = ETLBatchConfig.builder("* * * * *")
    .addStage(source)
    .addStage(sink)
    .addPostAction(postAction)
    .addConnection(source.getName(), sink.getName())
    .build();
  // create the pipeline
  ApplicationId pipelineId = NamespaceId.DEFAULT.app("textSinkTestPipeline");
  ApplicationManager appManager = deployApplication(pipelineId, new AppRequest<>(APP_ARTIFACT, pipelineConfig));
  // write some data to the input fileset
  Schema inputSchema = Schema.recordOf("test",
    Schema.Field.of("name", Schema.of(Schema.Type.STRING)),
    Schema.Field.of("item", Schema.of(Schema.Type.STRING)));
  Map<String, String> users = new HashMap<>();
  users.put("samuel", "wallet");
  users.put("dwayne", "rock");
  users.put("christopher", "cowbell");
  List<StructuredRecord> inputRecords = new ArrayList<>();
  for (Map.Entry<String, String> userEntry : users.entrySet()) {
    String name = userEntry.getKey();
    String item = userEntry.getValue();
    inputRecords.add(StructuredRecord.builder(inputSchema).set("name", name).set("item", item).build());
  }
  DataSetManager<Table> inputManager = getDataset(inputName);
  MockSource.writeInput(inputManager, inputRecords);
  // run the pipeline
  Map<String, String> runtimeArgs = new HashMap<>();
  // the ${dir} macro will be substituted with "users" for this pipeline run
  runtimeArgs.put("dir", outputDirName);
  WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
  workflowManager.start(runtimeArgs);
  workflowManager.waitForFinish(4, TimeUnit.MINUTES);
  // check the pipeline output
  DataSetManager<FileSet> outputManager = getDataset(outputName);
  FileSet output = outputManager.get();
  Location outputDir = output.getBaseLocation().append(outputDirName);
  Map<String, String> actual = new HashMap<>();
  for (Location outputFile : outputDir.list()) {
    if (outputFile.getName().endsWith(".crc") || "_SUCCESS".equals(outputFile.getName())) {
      Assert.fail("Post action did not delete file " + outputFile.getName());
    }
    try (BufferedReader reader = new BufferedReader(new InputStreamReader(outputFile.getInputStream()))) {
      String line;
      while ((line = reader.readLine()) != null) {
        String[] parts = line.split("\\|");
        actual.put(parts[0], parts[1]);
      }
    }
  }
  Assert.assertEquals(users, actual);
}
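When a stage is not backed by a convenience factory such as MockSource.getPlugin, it is built from an ETLPlugin directly, as the sink and post-action above show: the plugin name, its type constant, a map of string properties, and an artifact selector (left null in these tests). A condensed sketch of that pattern, reusing the outputName, source, and postAction values defined in the test above:

// A stage built directly from an ETLPlugin: plugin name, plugin type, string properties,
// and an artifact selector (null here, as in the tests above).
Map<String, String> sinkProperties = new HashMap<>();
sinkProperties.put(TextFileSetSink.Conf.FILESET_NAME, outputName);
sinkProperties.put(TextFileSetSink.Conf.OUTPUT_DIR, "${dir}"); // macro resolved from runtime arguments
ETLStage sink = new ETLStage("sink", new ETLPlugin(TextFileSetSink.NAME, BatchSink.PLUGIN_TYPE, sinkProperties, null));
// Post-actions are attached with addPostAction rather than addStage/addConnection.
ETLBatchConfig config = ETLBatchConfig.builder("* * * * *")
  .addStage(source)
  .addStage(sink)
  .addPostAction(postAction)
  .addConnection(source.getName(), sink.getName())
  .build();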
Use of co.cask.cdap.etl.proto.v2.ETLStage in project cdap by caskdata.
From class PipelineTest, method testWordCountSparkSink.
@SuppressWarnings("ConstantConditions")
@Test
public void testWordCountSparkSink() throws Exception {
  String inputName = "sparkSinkInput";
  String outputName = "sparkSinkOutput";
  // create the pipeline config
  ETLStage source = new ETLStage("source", MockSource.getPlugin(inputName));
  Map<String, String> sinkProperties = new HashMap<>();
  sinkProperties.put("field", "text");
  sinkProperties.put("tableName", outputName);
  ETLStage sink = new ETLStage("sink", new ETLPlugin(WordCountSink.NAME, SparkSink.PLUGIN_TYPE, sinkProperties, null));
  ETLBatchConfig pipelineConfig = ETLBatchConfig.builder("* * * * *")
    .addStage(source)
    .addStage(sink)
    .addConnection(source.getName(), sink.getName())
    .build();
  // create the pipeline
  ApplicationId pipelineId = NamespaceId.DEFAULT.app("sparkSinkTestPipeline");
  ApplicationManager appManager = deployApplication(pipelineId, new AppRequest<>(APP_ARTIFACT, pipelineConfig));
  // write the input
  Schema inputSchema = Schema.recordOf("text", Schema.Field.of("text", Schema.of(Schema.Type.STRING)));
  DataSetManager<Table> inputManager = getDataset(inputName);
  List<StructuredRecord> inputRecords = new ArrayList<>();
  inputRecords.add(StructuredRecord.builder(inputSchema).set("text", "Hello World").build());
  inputRecords.add(StructuredRecord.builder(inputSchema).set("text", "Hello my name is Hal").build());
  inputRecords.add(StructuredRecord.builder(inputSchema).set("text", "Hello my name is Sam").build());
  MockSource.writeInput(inputManager, inputRecords);
  WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
  workflowManager.start();
  workflowManager.waitForFinish(4, TimeUnit.MINUTES);
  DataSetManager<KeyValueTable> outputManager = getDataset(outputName);
  KeyValueTable output = outputManager.get();
  Assert.assertEquals(3L, Bytes.toLong(output.read("Hello")));
  Assert.assertEquals(1L, Bytes.toLong(output.read("World")));
  Assert.assertEquals(2L, Bytes.toLong(output.read("my")));
  Assert.assertEquals(2L, Bytes.toLong(output.read("name")));
  Assert.assertEquals(2L, Bytes.toLong(output.read("is")));
  Assert.assertEquals(1L, Bytes.toLong(output.read("Hal")));
  Assert.assertEquals(1L, Bytes.toLong(output.read("Sam")));
}
Use of co.cask.cdap.etl.proto.v2.ETLStage in project cdap by caskdata.
From class PipelineTest, method testTextFileSourceAndMoveAction.
@Test
public void testTextFileSourceAndMoveAction() throws Exception {
  // create the pipeline config
  String moveFromName = "sourceTestMoveFrom";
  String inputName = "sourceTestInput";
  String outputName = "sourceTestOutput";
  Map<String, String> actionProperties = new HashMap<>();
  actionProperties.put(FilesetMoveAction.Conf.SOURCE_FILESET, moveFromName);
  actionProperties.put(FilesetMoveAction.Conf.DEST_FILESET, inputName);
  ETLStage moveAction = new ETLStage("moveInput", new ETLPlugin(FilesetMoveAction.NAME, Action.PLUGIN_TYPE, actionProperties, null));
  Map<String, String> sourceProperties = new HashMap<>();
  sourceProperties.put(TextFileSetSource.Conf.FILESET_NAME, inputName);
  sourceProperties.put(TextFileSetSource.Conf.CREATE_IF_NOT_EXISTS, "true");
  sourceProperties.put(TextFileSetSource.Conf.DELETE_INPUT_ON_SUCCESS, "true");
  sourceProperties.put(TextFileSetSource.Conf.FILES, "${file}");
  ETLStage source = new ETLStage("source", new ETLPlugin(TextFileSetSource.NAME, BatchSource.PLUGIN_TYPE, sourceProperties, null));
  ETLStage sink = new ETLStage("sink", MockSink.getPlugin(outputName));
  ETLBatchConfig pipelineConfig = ETLBatchConfig.builder("* * * * *")
    .addStage(source)
    .addStage(sink)
    .addStage(moveAction)
    .addConnection(moveAction.getName(), source.getName())
    .addConnection(source.getName(), sink.getName())
    .build();
  // create the "move from" fileset
  addDatasetInstance(FileSet.class.getName(), moveFromName);
  // create the pipeline
  ApplicationId pipelineId = NamespaceId.DEFAULT.app("textSourceTestPipeline");
  ApplicationManager appManager = deployApplication(pipelineId, new AppRequest<>(APP_ARTIFACT, pipelineConfig));
  // write some files that will be moved to the input fileset
  DataSetManager<FileSet> moveFromManager = getDataset(moveFromName);
  // this file starts with '.' and should be ignored
  Location invisibleFile = moveFromManager.get().getBaseLocation().append(".hidden");
  try (OutputStream outputStream = invisibleFile.getOutputStream()) {
    outputStream.write(Bytes.toBytes("this should not be read"));
  }
  // this file should be moved
  String line1 = "Hello World!";
  String line2 = "Good to meet you";
  String line3 = "My name is Hal";
  String inputText = line1 + "\n" + line2 + "\n" + line3;
  Location inputFile = moveFromManager.get().getBaseLocation().append("inputFile");
  try (OutputStream outputStream = inputFile.getOutputStream()) {
    outputStream.write(Bytes.toBytes(inputText));
  }
  // run the pipeline
  Map<String, String> runtimeArgs = new HashMap<>();
  // the ${file} macro will be substituted with "inputFile" for this pipeline run
  runtimeArgs.put("file", "inputFile");
  WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
  workflowManager.start(runtimeArgs);
  workflowManager.waitForFinish(4, TimeUnit.MINUTES);
  // check the pipeline output
  DataSetManager<Table> outputManager = getDataset(outputName);
  Set<StructuredRecord> outputRecords = new HashSet<>();
  outputRecords.addAll(MockSink.readOutput(outputManager));
  Set<StructuredRecord> expected = new HashSet<>();
  expected.add(StructuredRecord.builder(TextFileSetSource.OUTPUT_SCHEMA)
    .set("position", (long) inputText.indexOf(line1))
    .set("text", line1)
    .build());
  expected.add(StructuredRecord.builder(TextFileSetSource.OUTPUT_SCHEMA)
    .set("position", (long) inputText.indexOf(line2))
    .set("text", line2)
    .build());
  expected.add(StructuredRecord.builder(TextFileSetSource.OUTPUT_SCHEMA)
    .set("position", (long) inputText.indexOf(line3))
    .set("text", line3)
    .build());
  Assert.assertEquals(expected, outputRecords);
  // check that the input file no longer exists in the "move from" fileset,
  // and that it was deleted by the source from the input fileset
  Assert.assertFalse(moveFromManager.get().getBaseLocation().append("inputFile").exists());
  DataSetManager<FileSet> inputManager = getDataset(inputName);
  Assert.assertFalse(inputManager.get().getBaseLocation().append("inputFile").exists());
}
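One detail worth noting in the example above: FilesetMoveAction is wired in as a regular Action stage (Action.PLUGIN_TYPE) rather than a post-action, and the connection from the action to the source is what causes it to run before the source reads its input. A condensed sketch, reusing the moveAction, source, and sink stages defined in the test above:

// An action stage connected ahead of the source runs before the data stages.
ETLBatchConfig config = ETLBatchConfig.builder("* * * * *")
  .addStage(source)
  .addStage(sink)
  .addStage(moveAction)
  .addConnection(moveAction.getName(), source.getName()) // moveAction runs first
  .addConnection(source.getName(), sink.getName())
  .build();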