Search in sources:

Example 31 with ETLBatchConfig

Use of co.cask.cdap.etl.proto.v2.ETLBatchConfig in project cdap by caskdata.

From the class PipelineSpecGeneratorTest, the method testConflictingInputErrorSchemas:

@Test(expected = IllegalArgumentException.class)
public void testConflictingInputErrorSchemas() {
    /*
     *           ---- transformA
     *           |        |
     * source ---|        |------- error -- sink
     *           |        |
     *           ---- transformB
     *
     * error gets schema B from transformA and schema A from transformB, so spec generation should fail
     */
    ETLBatchConfig etlConfig = ETLBatchConfig.builder("* * * * *")
        .addStage(new ETLStage("source", MOCK_SOURCE))
        .addStage(new ETLStage("sink", MOCK_SINK))
        .addStage(new ETLStage("tA", MOCK_TRANSFORM_A))
        .addStage(new ETLStage("tB", MOCK_TRANSFORM_B))
        .addStage(new ETLStage("error", MOCK_ERROR))
        .addConnection("source", "tA")
        .addConnection("source", "tB")
        .addConnection("tA", "error")
        .addConnection("tB", "error")
        .addConnection("error", "sink")
        .build();
    specGenerator.generateSpec(etlConfig);
}
Also used : ETLBatchConfig(co.cask.cdap.etl.proto.v2.ETLBatchConfig) ETLStage(co.cask.cdap.etl.proto.v2.ETLStage) Test(org.junit.Test)
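
For contrast, a pipeline in which the error collector receives error records from only one transform has a single, unambiguous error schema, so spec generation should succeed. The following is a minimal sketch that reuses the mock constants and specGenerator from this test class; it is illustrative only and not part of the original test.

    ETLBatchConfig validConfig = ETLBatchConfig.builder("* * * * *")
        .addStage(new ETLStage("source", MOCK_SOURCE))
        .addStage(new ETLStage("sink", MOCK_SINK))
        .addStage(new ETLStage("tA", MOCK_TRANSFORM_A))
        .addStage(new ETLStage("error", MOCK_ERROR))
        .addConnection("source", "tA")
        .addConnection("tA", "error")
        .addConnection("error", "sink")
        .build();
    // Only one error schema reaches the error collector, so no IllegalArgumentException is expected.
    specGenerator.generateSpec(validConfig);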

Example 32 with ETLBatchConfig

Use of co.cask.cdap.etl.proto.v2.ETLBatchConfig in project cdap by caskdata.

From the class PipelineSpecGeneratorTest, the method testConnectionOutOfSink:

@Test(expected = IllegalArgumentException.class)
public void testConnectionOutOfSink() {
    ETLBatchConfig etlConfig = ETLBatchConfig.builder("* * * * *")
        .addStage(new ETLStage("source", MOCK_SOURCE))
        .addStage(new ETLStage("sink", MOCK_SINK))
        .addStage(new ETLStage("transform", MOCK_TRANSFORM_A))
        .addConnection("source", "sink")
        .addConnection("sink", "transform")
        .build();
    specGenerator.generateSpec(etlConfig);
}
Also used : ETLBatchConfig(co.cask.cdap.etl.proto.v2.ETLBatchConfig) ETLStage(co.cask.cdap.etl.proto.v2.ETLStage) Test(org.junit.Test)
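
The valid counterpart keeps the sink as a terminal stage, with connections flowing only into it. A minimal sketch under the same assumptions (same mock constants and specGenerator as in this test class), shown for illustration only:

    ETLBatchConfig validConfig = ETLBatchConfig.builder("* * * * *")
        .addStage(new ETLStage("source", MOCK_SOURCE))
        .addStage(new ETLStage("transform", MOCK_TRANSFORM_A))
        .addStage(new ETLStage("sink", MOCK_SINK))
        .addConnection("source", "transform")
        .addConnection("transform", "sink")
        .build();
    // No connection leaves the sink, so generateSpec should not throw.
    specGenerator.generateSpec(validConfig);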

Example 33 with ETLBatchConfig

Use of co.cask.cdap.etl.proto.v2.ETLBatchConfig in project cdap by caskdata.

From the class PipelineTest, the method testWordCountSparkSink:

@SuppressWarnings("ConstantConditions")
@Test
public void testWordCountSparkSink() throws Exception {
    String inputName = "sparkSinkInput";
    String outputName = "sparkSinkOutput";
    // create the pipeline config
    ETLStage source = new ETLStage("source", MockSource.getPlugin(inputName));
    Map<String, String> sinkProperties = new HashMap<>();
    sinkProperties.put("field", "text");
    sinkProperties.put("tableName", outputName);
    ETLStage sink = new ETLStage("sink", new ETLPlugin(WordCountSink.NAME, SparkSink.PLUGIN_TYPE, sinkProperties, null));
    ETLBatchConfig pipelineConfig = ETLBatchConfig.builder("* * * * *")
        .addStage(source)
        .addStage(sink)
        .addConnection(source.getName(), sink.getName())
        .build();
    // create the pipeline
    ApplicationId pipelineId = NamespaceId.DEFAULT.app("sparkSinkTestPipeline");
    ApplicationManager appManager = deployApplication(pipelineId, new AppRequest<>(APP_ARTIFACT, pipelineConfig));
    // write the input
    Schema inputSchema = Schema.recordOf("text", Schema.Field.of("text", Schema.of(Schema.Type.STRING)));
    DataSetManager<Table> inputManager = getDataset(inputName);
    List<StructuredRecord> inputRecords = new ArrayList<>();
    inputRecords.add(StructuredRecord.builder(inputSchema).set("text", "Hello World").build());
    inputRecords.add(StructuredRecord.builder(inputSchema).set("text", "Hello my name is Hal").build());
    inputRecords.add(StructuredRecord.builder(inputSchema).set("text", "Hello my name is Sam").build());
    MockSource.writeInput(inputManager, inputRecords);
    WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
    workflowManager.start();
    workflowManager.waitForFinish(4, TimeUnit.MINUTES);
    DataSetManager<KeyValueTable> outputManager = getDataset(outputName);
    KeyValueTable output = outputManager.get();
    Assert.assertEquals(3L, Bytes.toLong(output.read("Hello")));
    Assert.assertEquals(1L, Bytes.toLong(output.read("World")));
    Assert.assertEquals(2L, Bytes.toLong(output.read("my")));
    Assert.assertEquals(2L, Bytes.toLong(output.read("name")));
    Assert.assertEquals(2L, Bytes.toLong(output.read("is")));
    Assert.assertEquals(1L, Bytes.toLong(output.read("Hal")));
    Assert.assertEquals(1L, Bytes.toLong(output.read("Sam")));
}
Also used : ApplicationManager(co.cask.cdap.test.ApplicationManager) Table(co.cask.cdap.api.dataset.table.Table) KeyValueTable(co.cask.cdap.api.dataset.lib.KeyValueTable) HashMap(java.util.HashMap) Schema(co.cask.cdap.api.data.schema.Schema) WorkflowManager(co.cask.cdap.test.WorkflowManager) ArrayList(java.util.ArrayList) ETLPlugin(co.cask.cdap.etl.proto.v2.ETLPlugin) StructuredRecord(co.cask.cdap.api.data.format.StructuredRecord) ETLBatchConfig(co.cask.cdap.etl.proto.v2.ETLBatchConfig) ETLStage(co.cask.cdap.etl.proto.v2.ETLStage) KeyValueTable(co.cask.cdap.api.dataset.lib.KeyValueTable) ApplicationId(co.cask.cdap.proto.id.ApplicationId) Test(org.junit.Test)
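
The word counts asserted above follow directly from the three input lines ("Hello" appears in all three, "my", "name", and "is" in two each, and the rest once). If preferred, the same checks could be driven from a map of expected counts; this is only a sketch of an alternative assertion style, assuming the same output KeyValueTable as in the test above.

    Map<String, Long> expectedCounts = new HashMap<>();
    expectedCounts.put("Hello", 3L);
    expectedCounts.put("World", 1L);
    expectedCounts.put("my", 2L);
    expectedCounts.put("name", 2L);
    expectedCounts.put("is", 2L);
    expectedCounts.put("Hal", 1L);
    expectedCounts.put("Sam", 1L);
    for (Map.Entry<String, Long> entry : expectedCounts.entrySet()) {
        // Each key is stored by the sink as a long count; compare against the expected value.
        Assert.assertEquals(entry.getValue().longValue(), Bytes.toLong(output.read(entry.getKey())));
    }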

Example 34 with ETLBatchConfig

Use of co.cask.cdap.etl.proto.v2.ETLBatchConfig in project cdap by caskdata.

From the class PipelineTest, the method testStringCaseTransform:

@Test
public void testStringCaseTransform() throws Exception {
    String inputName = "transformTestInput";
    String outputName = "transformTestOutput";
    // create the pipeline config
    ETLStage source = new ETLStage("source", MockSource.getPlugin(inputName));
    ETLStage sink = new ETLStage("sink", MockSink.getPlugin(outputName));
    Map<String, String> transformProperties = new HashMap<>();
    transformProperties.put("lowerFields", "first");
    transformProperties.put("upperFields", "last");
    ETLStage transform = new ETLStage("transform", new ETLPlugin(StringCaseTransform.NAME, Transform.PLUGIN_TYPE, transformProperties, null));
    ETLBatchConfig pipelineConfig = ETLBatchConfig.builder("* * * * *")
        .addStage(source)
        .addStage(sink)
        .addStage(transform)
        .addConnection(source.getName(), transform.getName())
        .addConnection(transform.getName(), sink.getName())
        .build();
    // create the pipeline
    ApplicationId pipelineId = NamespaceId.DEFAULT.app("transformTestPipeline");
    ApplicationManager appManager = deployApplication(pipelineId, new AppRequest<>(APP_ARTIFACT, pipelineConfig));
    // write the input
    Schema schema = Schema.recordOf("name", Schema.Field.of("first", Schema.of(Schema.Type.STRING)), Schema.Field.of("last", Schema.of(Schema.Type.STRING)));
    DataSetManager<Table> inputManager = getDataset(inputName);
    List<StructuredRecord> inputRecords = new ArrayList<>();
    inputRecords.add(StructuredRecord.builder(schema).set("first", "Samuel").set("last", "Jackson").build());
    MockSource.writeInput(inputManager, inputRecords);
    WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
    workflowManager.start();
    workflowManager.waitForFinish(4, TimeUnit.MINUTES);
    DataSetManager<Table> outputManager = getDataset(outputName);
    List<StructuredRecord> outputRecords = MockSink.readOutput(outputManager);
    List<StructuredRecord> expected = new ArrayList<>();
    expected.add(StructuredRecord.builder(schema).set("first", "samuel").set("last", "JACKSON").build());
    Assert.assertEquals(expected, outputRecords);
}
Also used : ApplicationManager(co.cask.cdap.test.ApplicationManager) Table(co.cask.cdap.api.dataset.table.Table) KeyValueTable(co.cask.cdap.api.dataset.lib.KeyValueTable) HashMap(java.util.HashMap) Schema(co.cask.cdap.api.data.schema.Schema) WorkflowManager(co.cask.cdap.test.WorkflowManager) ArrayList(java.util.ArrayList) ETLPlugin(co.cask.cdap.etl.proto.v2.ETLPlugin) StructuredRecord(co.cask.cdap.api.data.format.StructuredRecord) ETLBatchConfig(co.cask.cdap.etl.proto.v2.ETLBatchConfig) ETLStage(co.cask.cdap.etl.proto.v2.ETLStage) ApplicationId(co.cask.cdap.proto.id.ApplicationId) Test(org.junit.Test)

Example 35 with ETLBatchConfig

Use of co.cask.cdap.etl.proto.v2.ETLBatchConfig in project cdap by caskdata.

From the class PipelineTest, the method testTextFileSourceAndMoveAction:

@Test
public void testTextFileSourceAndMoveAction() throws Exception {
    // create the pipeline config
    String moveFromName = "sourceTestMoveFrom";
    String inputName = "sourceTestInput";
    String outputName = "sourceTestOutput";
    Map<String, String> actionProperties = new HashMap<>();
    actionProperties.put(FilesetMoveAction.Conf.SOURCE_FILESET, "sourceTestMoveFrom");
    actionProperties.put(FilesetMoveAction.Conf.DEST_FILESET, inputName);
    ETLStage moveAction = new ETLStage("moveInput", new ETLPlugin(FilesetMoveAction.NAME, Action.PLUGIN_TYPE, actionProperties, null));
    Map<String, String> sourceProperties = new HashMap<>();
    sourceProperties.put(TextFileSetSource.Conf.FILESET_NAME, inputName);
    sourceProperties.put(TextFileSetSource.Conf.CREATE_IF_NOT_EXISTS, "true");
    sourceProperties.put(TextFileSetSource.Conf.DELETE_INPUT_ON_SUCCESS, "true");
    sourceProperties.put(TextFileSetSource.Conf.FILES, "${file}");
    ETLStage source = new ETLStage("source", new ETLPlugin(TextFileSetSource.NAME, BatchSource.PLUGIN_TYPE, sourceProperties, null));
    ETLStage sink = new ETLStage("sink", MockSink.getPlugin(outputName));
    ETLBatchConfig pipelineConfig = ETLBatchConfig.builder("* * * * *")
        .addStage(source)
        .addStage(sink)
        .addStage(moveAction)
        .addConnection(moveAction.getName(), source.getName())
        .addConnection(source.getName(), sink.getName())
        .build();
    // create the move from fileset
    addDatasetInstance(FileSet.class.getName(), moveFromName);
    // create the pipeline
    ApplicationId pipelineId = NamespaceId.DEFAULT.app("textSourceTestPipeline");
    ApplicationManager appManager = deployApplication(pipelineId, new AppRequest<>(APP_ARTIFACT, pipelineConfig));
    // write some files that will be moved to the input fileset
    DataSetManager<FileSet> moveFromManager = getDataset(moveFromName);
    // this file starts with '.' and should be ignored.
    Location invisibleFile = moveFromManager.get().getBaseLocation().append(".hidden");
    try (OutputStream outputStream = invisibleFile.getOutputStream()) {
        outputStream.write(Bytes.toBytes("this should not be read"));
    }
    // this file should be moved
    String line1 = "Hello World!";
    String line2 = "Good to meet you";
    String line3 = "My name is Hal";
    String inputText = line1 + "\n" + line2 + "\n" + line3;
    Location inputFile = moveFromManager.get().getBaseLocation().append("inputFile");
    try (OutputStream outputStream = inputFile.getOutputStream()) {
        outputStream.write(Bytes.toBytes(inputText));
    }
    // run the pipeline
    Map<String, String> runtimeArgs = new HashMap<>();
    // the ${file} macro will be substituted with "inputFile" for our pipeline run
    runtimeArgs.put("file", "inputFile");
    WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
    workflowManager.start(runtimeArgs);
    workflowManager.waitForFinish(4, TimeUnit.MINUTES);
    // check the pipeline output
    DataSetManager<Table> outputManager = getDataset(outputName);
    Set<StructuredRecord> outputRecords = new HashSet<>();
    outputRecords.addAll(MockSink.readOutput(outputManager));
    Set<StructuredRecord> expected = new HashSet<>();
    expected.add(StructuredRecord.builder(TextFileSetSource.OUTPUT_SCHEMA).set("position", (long) inputText.indexOf(line1)).set("text", line1).build());
    expected.add(StructuredRecord.builder(TextFileSetSource.OUTPUT_SCHEMA).set("position", (long) inputText.indexOf(line2)).set("text", line2).build());
    expected.add(StructuredRecord.builder(TextFileSetSource.OUTPUT_SCHEMA).set("position", (long) inputText.indexOf(line3)).set("text", line3).build());
    Assert.assertEquals(expected, outputRecords);
    // check that the input file does not exist in the moveFrom fileSet,
    // and was deleted by the source in the input fileSet
    Assert.assertFalse(moveFromManager.get().getBaseLocation().append("inputFile").exists());
    DataSetManager<FileSet> inputManager = getDataset(inputName);
    Assert.assertFalse(inputManager.get().getBaseLocation().append("inputFile").exists());
}
Also used : ApplicationManager(co.cask.cdap.test.ApplicationManager) Table(co.cask.cdap.api.dataset.table.Table) KeyValueTable(co.cask.cdap.api.dataset.lib.KeyValueTable) FileSet(co.cask.cdap.api.dataset.lib.FileSet) HashMap(java.util.HashMap) OutputStream(java.io.OutputStream) WorkflowManager(co.cask.cdap.test.WorkflowManager) ETLPlugin(co.cask.cdap.etl.proto.v2.ETLPlugin) StructuredRecord(co.cask.cdap.api.data.format.StructuredRecord) ETLBatchConfig(co.cask.cdap.etl.proto.v2.ETLBatchConfig) ETLStage(co.cask.cdap.etl.proto.v2.ETLStage) ApplicationId(co.cask.cdap.proto.id.ApplicationId) Location(org.apache.twill.filesystem.Location) HashSet(java.util.HashSet) Test(org.junit.Test)
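
Because the files to read are supplied through the ${file} macro, the same deployed pipeline can be re-run against a different file simply by changing the runtime argument. A minimal sketch of such an extra run, reusing the managers from the test above (hypothetical, not part of the original test):

    // Write a second input file into the move-from fileset.
    Location secondFile = moveFromManager.get().getBaseLocation().append("inputFile2");
    try (OutputStream outputStream = secondFile.getOutputStream()) {
        outputStream.write(Bytes.toBytes("Another line of text"));
    }
    // Re-run the workflow, substituting ${file} with the new file name.
    Map<String, String> secondRunArgs = new HashMap<>();
    secondRunArgs.put("file", "inputFile2");
    workflowManager.start(secondRunArgs);
    workflowManager.waitForFinish(4, TimeUnit.MINUTES);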

Aggregations

ETLBatchConfig (co.cask.cdap.etl.proto.v2.ETLBatchConfig) 47
ETLStage (co.cask.cdap.etl.proto.v2.ETLStage) 46
ApplicationId (co.cask.cdap.proto.id.ApplicationId) 32
ApplicationManager (co.cask.cdap.test.ApplicationManager) 30
WorkflowManager (co.cask.cdap.test.WorkflowManager) 30
AppRequest (co.cask.cdap.proto.artifact.AppRequest) 27
Test (org.junit.Test) 27
Table (co.cask.cdap.api.dataset.table.Table) 26
StructuredRecord (co.cask.cdap.api.data.format.StructuredRecord) 25
KeyValueTable (co.cask.cdap.api.dataset.lib.KeyValueTable) 24
Schema (co.cask.cdap.api.data.schema.Schema) 22
ETLPlugin (co.cask.cdap.etl.proto.v2.ETLPlugin) 9
ArrayList (java.util.ArrayList) 7
HashMap (java.util.HashMap) 7
HashSet (java.util.HashSet) 6
BatchPipelineSpec (co.cask.cdap.etl.batch.BatchPipelineSpec) 4
Resources (co.cask.cdap.api.Resources) 2
FileSet (co.cask.cdap.api.dataset.lib.FileSet) 2
PreviewManager (co.cask.cdap.app.preview.PreviewManager) 2
PreviewRunner (co.cask.cdap.app.preview.PreviewRunner) 2