Example 56 with WorkflowManager

Use of co.cask.cdap.test.WorkflowManager in project cdap by caskdata.

From class PipelineTest, method testTextFileSinkAndDeletePostAction.

@Test
public void testTextFileSinkAndDeletePostAction() throws Exception {
    // create the pipeline config
    String inputName = "sinkTestInput";
    String outputName = "sinkTestOutput";
    String outputDirName = "users";
    ETLStage source = new ETLStage("source", MockSource.getPlugin(inputName));
    Map<String, String> sinkProperties = new HashMap<>();
    sinkProperties.put(TextFileSetSink.Conf.FILESET_NAME, outputName);
    sinkProperties.put(TextFileSetSink.Conf.FIELD_SEPARATOR, "|");
    sinkProperties.put(TextFileSetSink.Conf.OUTPUT_DIR, "${dir}");
    ETLStage sink = new ETLStage("sink", new ETLPlugin(TextFileSetSink.NAME, BatchSink.PLUGIN_TYPE, sinkProperties, null));
    Map<String, String> actionProperties = new HashMap<>();
    actionProperties.put(FilesetDeletePostAction.Conf.FILESET_NAME, outputName);
    // MapReduce writes multiple files to the output directory. Along with the actual output,
    // there are .crc checksum files and a _SUCCESS marker that contain none of the output content.
    actionProperties.put(FilesetDeletePostAction.Conf.DELETE_REGEX, ".*\\.crc|_SUCCESS");
    actionProperties.put(FilesetDeletePostAction.Conf.DIRECTORY, outputDirName);
    ETLStage postAction = new ETLStage("cleanup", new ETLPlugin(FilesetDeletePostAction.NAME, PostAction.PLUGIN_TYPE, actionProperties, null));
    ETLBatchConfig pipelineConfig = ETLBatchConfig.builder("* * * * *")
        .addStage(source)
        .addStage(sink)
        .addPostAction(postAction)
        .addConnection(source.getName(), sink.getName())
        .build();
    // create the pipeline
    ApplicationId pipelineId = NamespaceId.DEFAULT.app("textSinkTestPipeline");
    ApplicationManager appManager = deployApplication(pipelineId, new AppRequest<>(APP_ARTIFACT, pipelineConfig));
    // write some data to the input fileset
    Schema inputSchema = Schema.recordOf("test", Schema.Field.of("name", Schema.of(Schema.Type.STRING)), Schema.Field.of("item", Schema.of(Schema.Type.STRING)));
    Map<String, String> users = new HashMap<>();
    users.put("samuel", "wallet");
    users.put("dwayne", "rock");
    users.put("christopher", "cowbell");
    List<StructuredRecord> inputRecords = new ArrayList<>();
    for (Map.Entry<String, String> userEntry : users.entrySet()) {
        String name = userEntry.getKey();
        String item = userEntry.getValue();
        inputRecords.add(StructuredRecord.builder(inputSchema).set("name", name).set("item", item).build());
    }
    DataSetManager<Table> inputManager = getDataset(inputName);
    MockSource.writeInput(inputManager, inputRecords);
    // run the pipeline
    Map<String, String> runtimeArgs = new HashMap<>();
    // the ${dir} macro will be substituted with "users" for our pipeline run
    runtimeArgs.put("dir", outputDirName);
    WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
    workflowManager.start(runtimeArgs);
    workflowManager.waitForFinish(4, TimeUnit.MINUTES);
    // check the pipeline output
    DataSetManager<FileSet> outputManager = getDataset(outputName);
    FileSet output = outputManager.get();
    Location outputDir = output.getBaseLocation().append(outputDirName);
    Map<String, String> actual = new HashMap<>();
    for (Location outputFile : outputDir.list()) {
        if (outputFile.getName().endsWith(".crc") || "_SUCCESS".equals(outputFile.getName())) {
            Assert.fail("Post action did not delete file " + outputFile.getName());
        }
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(outputFile.getInputStream()))) {
            String line;
            while ((line = reader.readLine()) != null) {
                String[] parts = line.split("\\|");
                actual.put(parts[0], parts[1]);
            }
        }
    }
    Assert.assertEquals(users, actual);
}
Also used: ApplicationManager(co.cask.cdap.test.ApplicationManager), Table(co.cask.cdap.api.dataset.table.Table), KeyValueTable(co.cask.cdap.api.dataset.lib.KeyValueTable), FileSet(co.cask.cdap.api.dataset.lib.FileSet), InputStreamReader(java.io.InputStreamReader), HashMap(java.util.HashMap), Schema(co.cask.cdap.api.data.schema.Schema), WorkflowManager(co.cask.cdap.test.WorkflowManager), ArrayList(java.util.ArrayList), ETLPlugin(co.cask.cdap.etl.proto.v2.ETLPlugin), StructuredRecord(co.cask.cdap.api.data.format.StructuredRecord), ETLBatchConfig(co.cask.cdap.etl.proto.v2.ETLBatchConfig), ETLStage(co.cask.cdap.etl.proto.v2.ETLStage), BufferedReader(java.io.BufferedReader), ApplicationId(co.cask.cdap.proto.id.ApplicationId), Map(java.util.Map), Location(org.apache.twill.filesystem.Location), Test(org.junit.Test)
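
The post action's behavior hinges on the DELETE_REGEX value configured above. A quick standalone check of how that pattern classifies typical MapReduce output files (plain java.util.regex, no CDAP dependencies; the filenames are illustrative):

import java.util.regex.Pattern;

public class DeleteRegexCheck {
    public static void main(String[] args) {
        // same pattern passed to FilesetDeletePostAction.Conf.DELETE_REGEX above
        Pattern deletePattern = Pattern.compile(".*\\.crc|_SUCCESS");
        String[] names = {"part-m-00000", ".part-m-00000.crc", "_SUCCESS"};
        for (String name : names) {
            // matches() tests the whole filename, so only part-m-00000 survives
            System.out.println(name + " -> delete=" + deletePattern.matcher(name).matches());
        }
    }
}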

Example 57 with WorkflowManager

Use of co.cask.cdap.test.WorkflowManager in project cdap by caskdata.

From class PipelineTest, method testWordCountSparkSink.

@SuppressWarnings("ConstantConditions")
@Test
public void testWordCountSparkSink() throws Exception {
    String inputName = "sparkSinkInput";
    String outputName = "sparkSinkOutput";
    // create the pipeline config
    ETLStage source = new ETLStage("source", MockSource.getPlugin(inputName));
    Map<String, String> sinkProperties = new HashMap<>();
    sinkProperties.put("field", "text");
    sinkProperties.put("tableName", outputName);
    ETLStage sink = new ETLStage("sink", new ETLPlugin(WordCountSink.NAME, SparkSink.PLUGIN_TYPE, sinkProperties, null));
    ETLBatchConfig pipelineConfig = ETLBatchConfig.builder("* * * * *")
        .addStage(source)
        .addStage(sink)
        .addConnection(source.getName(), sink.getName())
        .build();
    // create the pipeline
    ApplicationId pipelineId = NamespaceId.DEFAULT.app("sparkSinkTestPipeline");
    ApplicationManager appManager = deployApplication(pipelineId, new AppRequest<>(APP_ARTIFACT, pipelineConfig));
    // write the input
    Schema inputSchema = Schema.recordOf("text", Schema.Field.of("text", Schema.of(Schema.Type.STRING)));
    DataSetManager<Table> inputManager = getDataset(inputName);
    List<StructuredRecord> inputRecords = new ArrayList<>();
    inputRecords.add(StructuredRecord.builder(inputSchema).set("text", "Hello World").build());
    inputRecords.add(StructuredRecord.builder(inputSchema).set("text", "Hello my name is Hal").build());
    inputRecords.add(StructuredRecord.builder(inputSchema).set("text", "Hello my name is Sam").build());
    MockSource.writeInput(inputManager, inputRecords);
    // run the pipeline
    WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
    workflowManager.start();
    workflowManager.waitForFinish(4, TimeUnit.MINUTES);
    // check the output word counts
    DataSetManager<KeyValueTable> outputManager = getDataset(outputName);
    KeyValueTable output = outputManager.get();
    Assert.assertEquals(3L, Bytes.toLong(output.read("Hello")));
    Assert.assertEquals(1L, Bytes.toLong(output.read("World")));
    Assert.assertEquals(2L, Bytes.toLong(output.read("my")));
    Assert.assertEquals(2L, Bytes.toLong(output.read("name")));
    Assert.assertEquals(2L, Bytes.toLong(output.read("is")));
    Assert.assertEquals(1L, Bytes.toLong(output.read("Hal")));
    Assert.assertEquals(1L, Bytes.toLong(output.read("Sam")));
}
Also used: ApplicationManager(co.cask.cdap.test.ApplicationManager), Table(co.cask.cdap.api.dataset.table.Table), KeyValueTable(co.cask.cdap.api.dataset.lib.KeyValueTable), HashMap(java.util.HashMap), Schema(co.cask.cdap.api.data.schema.Schema), WorkflowManager(co.cask.cdap.test.WorkflowManager), ArrayList(java.util.ArrayList), ETLPlugin(co.cask.cdap.etl.proto.v2.ETLPlugin), StructuredRecord(co.cask.cdap.api.data.format.StructuredRecord), ETLBatchConfig(co.cask.cdap.etl.proto.v2.ETLBatchConfig), ETLStage(co.cask.cdap.etl.proto.v2.ETLStage), ApplicationId(co.cask.cdap.proto.id.ApplicationId), Test(org.junit.Test)
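
The asserted counts can be cross-checked without running Spark. A throwaway sketch that tokenizes the same three input lines on whitespace (the split rule is an assumption about what WordCountSink does; the plugin itself is defined elsewhere in this project):

import java.util.HashMap;
import java.util.Map;

public class WordCountCheck {
    public static void main(String[] args) {
        String[] lines = {"Hello World", "Hello my name is Hal", "Hello my name is Sam"};
        Map<String, Long> counts = new HashMap<>();
        for (String line : lines) {
            for (String word : line.split("\\s+")) {
                counts.merge(word, 1L, Long::sum);
            }
        }
        // prints {Hello=3, World=1, my=2, name=2, is=2, Hal=1, Sam=1} (iteration order may vary)
        System.out.println(counts);
    }
}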

Example 58 with WorkflowManager

Use of co.cask.cdap.test.WorkflowManager in project cdap by caskdata.

From class PipelineTest, method testTextFileSourceAndMoveAction.

@Test
public void testTextFileSourceAndMoveAction() throws Exception {
    // create the pipeline config
    String moveFromName = "sourceTestMoveFrom";
    String inputName = "sourceTestInput";
    String outputName = "sourceTestOutput";
    Map<String, String> actionProperties = new HashMap<>();
    actionProperties.put(FilesetMoveAction.Conf.SOURCE_FILESET, moveFromName);
    actionProperties.put(FilesetMoveAction.Conf.DEST_FILESET, inputName);
    ETLStage moveAction = new ETLStage("moveInput", new ETLPlugin(FilesetMoveAction.NAME, Action.PLUGIN_TYPE, actionProperties, null));
    Map<String, String> sourceProperties = new HashMap<>();
    sourceProperties.put(TextFileSetSource.Conf.FILESET_NAME, inputName);
    sourceProperties.put(TextFileSetSource.Conf.CREATE_IF_NOT_EXISTS, "true");
    sourceProperties.put(TextFileSetSource.Conf.DELETE_INPUT_ON_SUCCESS, "true");
    sourceProperties.put(TextFileSetSource.Conf.FILES, "${file}");
    ETLStage source = new ETLStage("source", new ETLPlugin(TextFileSetSource.NAME, BatchSource.PLUGIN_TYPE, sourceProperties, null));
    ETLStage sink = new ETLStage("sink", MockSink.getPlugin(outputName));
    ETLBatchConfig pipelineConfig = ETLBatchConfig.builder("* * * * *")
        .addStage(source)
        .addStage(sink)
        .addStage(moveAction)
        .addConnection(moveAction.getName(), source.getName())
        .addConnection(source.getName(), sink.getName())
        .build();
    // create the move from fileset
    addDatasetInstance(FileSet.class.getName(), moveFromName);
    // create the pipeline
    ApplicationId pipelineId = NamespaceId.DEFAULT.app("textSourceTestPipeline");
    ApplicationManager appManager = deployApplication(pipelineId, new AppRequest<>(APP_ARTIFACT, pipelineConfig));
    // write some files that will be moved to the input fileset
    DataSetManager<FileSet> moveFromManager = getDataset(moveFromName);
    // this file starts with '.' and should be ignored.
    Location invisibleFile = moveFromManager.get().getBaseLocation().append(".hidden");
    try (OutputStream outputStream = invisibleFile.getOutputStream()) {
        outputStream.write(Bytes.toBytes("this should not be read"));
    }
    // this file should be moved
    String line1 = "Hello World!";
    String line2 = "Good to meet you";
    String line3 = "My name is Hal";
    String inputText = line1 + "\n" + line2 + "\n" + line3;
    Location inputFile = moveFromManager.get().getBaseLocation().append("inputFile");
    try (OutputStream outputStream = inputFile.getOutputStream()) {
        outputStream.write(Bytes.toBytes(inputText));
    }
    // run the pipeline
    Map<String, String> runtimeArgs = new HashMap<>();
    // the ${file} macro will be substituted with "inputFile" for our pipeline run
    runtimeArgs.put("file", "inputFile");
    WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
    workflowManager.start(runtimeArgs);
    workflowManager.waitForFinish(4, TimeUnit.MINUTES);
    // check the pipeline output
    DataSetManager<Table> outputManager = getDataset(outputName);
    Set<StructuredRecord> outputRecords = new HashSet<>();
    outputRecords.addAll(MockSink.readOutput(outputManager));
    Set<StructuredRecord> expected = new HashSet<>();
    expected.add(StructuredRecord.builder(TextFileSetSource.OUTPUT_SCHEMA).set("position", (long) inputText.indexOf(line1)).set("text", line1).build());
    expected.add(StructuredRecord.builder(TextFileSetSource.OUTPUT_SCHEMA).set("position", (long) inputText.indexOf(line2)).set("text", line2).build());
    expected.add(StructuredRecord.builder(TextFileSetSource.OUTPUT_SCHEMA).set("position", (long) inputText.indexOf(line3)).set("text", line3).build());
    Assert.assertEquals(expected, outputRecords);
    // check that the input file does not exist in the moveFrom fileSet,
    // and was deleted by the source in the input fileSet
    Assert.assertFalse(moveFromManager.get().getBaseLocation().append("inputFile").exists());
    DataSetManager<FileSet> inputManager = getDataset(inputName);
    Assert.assertFalse(inputManager.get().getBaseLocation().append("inputFile").exists());
}
Also used: ApplicationManager(co.cask.cdap.test.ApplicationManager), Table(co.cask.cdap.api.dataset.table.Table), KeyValueTable(co.cask.cdap.api.dataset.lib.KeyValueTable), FileSet(co.cask.cdap.api.dataset.lib.FileSet), HashMap(java.util.HashMap), OutputStream(java.io.OutputStream), WorkflowManager(co.cask.cdap.test.WorkflowManager), ETLPlugin(co.cask.cdap.etl.proto.v2.ETLPlugin), StructuredRecord(co.cask.cdap.api.data.format.StructuredRecord), ETLBatchConfig(co.cask.cdap.etl.proto.v2.ETLBatchConfig), ETLStage(co.cask.cdap.etl.proto.v2.ETLStage), ApplicationId(co.cask.cdap.proto.id.ApplicationId), Location(org.apache.twill.filesystem.Location), HashSet(java.util.HashSet), Test(org.junit.Test)
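
The expected "position" values are the character offsets of each line inside the concatenated input text. A standalone check that mirrors the indexOf calls in the test:

public class PositionCheck {
    public static void main(String[] args) {
        String line1 = "Hello World!";
        String line2 = "Good to meet you";
        String line3 = "My name is Hal";
        String inputText = line1 + "\n" + line2 + "\n" + line3;
        // each offset is the previous offset plus the previous line's length plus one newline
        System.out.println(inputText.indexOf(line1)); // 0
        System.out.println(inputText.indexOf(line2)); // 13 = 12 + 1
        System.out.println(inputText.indexOf(line3)); // 30 = 13 + 16 + 1
    }
}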

Example 59 with WorkflowManager

Use of co.cask.cdap.test.WorkflowManager in project cdap by caskdata.

From class PipelineTest, method testStringCaseTransform.

@Test
public void testStringCaseTransform() throws Exception {
    String inputName = "transformTestInput";
    String outputName = "transformTestOutput";
    // create the pipeline config
    ETLStage source = new ETLStage("source", MockSource.getPlugin(inputName));
    ETLStage sink = new ETLStage("sink", MockSink.getPlugin(outputName));
    Map<String, String> transformProperties = new HashMap<>();
    transformProperties.put("lowerFields", "first");
    transformProperties.put("upperFields", "last");
    ETLStage transform = new ETLStage("transform", new ETLPlugin(StringCaseTransform.NAME, Transform.PLUGIN_TYPE, transformProperties, null));
    ETLBatchConfig pipelineConfig = ETLBatchConfig.builder("* * * * *")
        .addStage(source)
        .addStage(sink)
        .addStage(transform)
        .addConnection(source.getName(), transform.getName())
        .addConnection(transform.getName(), sink.getName())
        .build();
    // create the pipeline
    ApplicationId pipelineId = NamespaceId.DEFAULT.app("transformTestPipeline");
    ApplicationManager appManager = deployApplication(pipelineId, new AppRequest<>(APP_ARTIFACT, pipelineConfig));
    // write the input
    Schema schema = Schema.recordOf("name", Schema.Field.of("first", Schema.of(Schema.Type.STRING)), Schema.Field.of("last", Schema.of(Schema.Type.STRING)));
    DataSetManager<Table> inputManager = getDataset(inputName);
    List<StructuredRecord> inputRecords = new ArrayList<>();
    inputRecords.add(StructuredRecord.builder(schema).set("first", "Samuel").set("last", "Jackson").build());
    MockSource.writeInput(inputManager, inputRecords);
    // run the pipeline
    WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
    workflowManager.start();
    workflowManager.waitForFinish(4, TimeUnit.MINUTES);
    // check the pipeline output
    DataSetManager<Table> outputManager = getDataset(outputName);
    List<StructuredRecord> outputRecords = MockSink.readOutput(outputManager);
    List<StructuredRecord> expected = new ArrayList<>();
    expected.add(StructuredRecord.builder(schema).set("first", "samuel").set("last", "JACKSON").build());
    Assert.assertEquals(expected, outputRecords);
}
Also used: ApplicationManager(co.cask.cdap.test.ApplicationManager), Table(co.cask.cdap.api.dataset.table.Table), KeyValueTable(co.cask.cdap.api.dataset.lib.KeyValueTable), HashMap(java.util.HashMap), Schema(co.cask.cdap.api.data.schema.Schema), WorkflowManager(co.cask.cdap.test.WorkflowManager), ArrayList(java.util.ArrayList), ETLPlugin(co.cask.cdap.etl.proto.v2.ETLPlugin), StructuredRecord(co.cask.cdap.api.data.format.StructuredRecord), ETLBatchConfig(co.cask.cdap.etl.proto.v2.ETLBatchConfig), ETLStage(co.cask.cdap.etl.proto.v2.ETLStage), ApplicationId(co.cask.cdap.proto.id.ApplicationId), Test(org.junit.Test)
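
The expected record follows directly from the stage configuration: fields listed in lowerFields are lowercased and fields in upperFields are uppercased. The equivalent plain string operations (the StringCaseTransform plugin itself is defined elsewhere in this project):

public class CaseCheck {
    public static void main(String[] args) {
        // lowerFields = "first", upperFields = "last", as configured above
        System.out.println("Samuel".toLowerCase()); // samuel
        System.out.println("Jackson".toUpperCase()); // JACKSON
    }
}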

Aggregations

WorkflowManager (co.cask.cdap.test.WorkflowManager): 59
ApplicationManager (co.cask.cdap.test.ApplicationManager): 57
ApplicationId (co.cask.cdap.proto.id.ApplicationId): 46
ETLBatchConfig (co.cask.cdap.etl.proto.v2.ETLBatchConfig): 44
ETLStage (co.cask.cdap.etl.proto.v2.ETLStage): 44
KeyValueTable (co.cask.cdap.api.dataset.lib.KeyValueTable): 42
AppRequest (co.cask.cdap.proto.artifact.AppRequest): 41
Table (co.cask.cdap.api.dataset.table.Table): 39
Test (org.junit.Test): 35
StructuredRecord (co.cask.cdap.api.data.format.StructuredRecord): 34
Schema (co.cask.cdap.api.data.schema.Schema): 31
HashMap (java.util.HashMap): 12
RunRecord (co.cask.cdap.proto.RunRecord): 9
ArrayList (java.util.ArrayList): 9
ETLPlugin (co.cask.cdap.etl.proto.v2.ETLPlugin): 7
HashSet (java.util.HashSet): 7
ConflictException (co.cask.cdap.common.ConflictException): 6
File (java.io.File): 6
IOException (java.io.IOException): 6
TimeoutException (java.util.concurrent.TimeoutException): 6
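
Taken together, the aggregation counts reflect one recurring pattern: each example deploys a pipeline app, fetches the WorkflowManager for SmartWorkflow, starts a run, and waits for it to finish before asserting on datasets. A minimal sketch of that skeleton, assuming the same TestBase environment as the examples above (the app name "somePipeline" is a placeholder; pipelineConfig and runtimeArgs stand in for whatever the individual test builds):

@Test
public void testPipelineSkeleton() throws Exception {
    // deploy the pipeline application into the default namespace
    ApplicationId pipelineId = NamespaceId.DEFAULT.app("somePipeline");
    ApplicationManager appManager = deployApplication(pipelineId, new AppRequest<>(APP_ARTIFACT, pipelineConfig));
    // fetch the workflow that drives batch pipelines and start a run
    WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
    workflowManager.start(runtimeArgs);
    // block until the run completes, up to the timeout
    workflowManager.waitForFinish(4, TimeUnit.MINUTES);
    // ...assert on output datasets here...
}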