Search in sources :

Example 1 with ETLPlugin

use of co.cask.cdap.etl.proto.v2.ETLPlugin in project cdap by caskdata.

the class PipelineSpecGeneratorTest method testPipelineProperties.

@Test
public void testPipelineProperties() {
    // populate some mock plugins.
    MockPluginConfigurer pluginConfigurer = new MockPluginConfigurer();
    Set<ArtifactId> artifactIds = ImmutableSet.of(ARTIFACT_ID);
    pluginConfigurer.addMockPlugin(Action.PLUGIN_TYPE, "action1", MockPlugin.builder().putPipelineProperty("prop1", "val1").putPipelineProperty("prop2", "val2").build(), artifactIds);
    pluginConfigurer.addMockPlugin(Action.PLUGIN_TYPE, "action2", MockPlugin.builder().putPipelineProperty("prop2", "val2").build(), artifactIds);
    PipelineSpecGenerator specGenerator = new BatchPipelineSpecGenerator(pluginConfigurer, ImmutableSet.of(BatchSource.PLUGIN_TYPE), ImmutableSet.of(BatchSink.PLUGIN_TYPE), FileSet.class, DatasetProperties.EMPTY, Engine.MAPREDUCE);
    Map<String, String> empty = ImmutableMap.of();
    ETLConfig config = ETLBatchConfig.builder("* * * * *").setProperties(ImmutableMap.of("system.spark.spark.test", "abc", "system.mapreduce.prop3", "val3")).addStage(new ETLStage("a1", new ETLPlugin("action1", Action.PLUGIN_TYPE, empty))).addStage(new ETLStage("a2", new ETLPlugin("action2", Action.PLUGIN_TYPE, empty))).addConnection("a1", "a2").setEngine(Engine.MAPREDUCE).build();
    PipelineSpec actual = specGenerator.generateSpec(config);
    PipelineSpec expected = BatchPipelineSpec.builder().addConnection("a1", "a2").setProperties(ImmutableMap.of("prop1", "val1", "prop2", "val2", "prop3", "val3")).addStage(StageSpec.builder("a1", new PluginSpec(Action.PLUGIN_TYPE, "action1", empty, ARTIFACT_ID)).addOutputs("a2").build()).addStage(StageSpec.builder("a2", new PluginSpec(Action.PLUGIN_TYPE, "action2", empty, ARTIFACT_ID)).addInputs("a1").build()).setResources(new Resources(1024)).setDriverResources(new Resources(1024)).setClientResources(new Resources(1024)).build();
    Assert.assertEquals(expected, actual);
}
Also used : ArtifactId(co.cask.cdap.api.artifact.ArtifactId) BatchPipelineSpecGenerator(co.cask.cdap.etl.batch.BatchPipelineSpecGenerator) ETLPlugin(co.cask.cdap.etl.proto.v2.ETLPlugin) MockPluginConfigurer(co.cask.cdap.etl.common.MockPluginConfigurer) ETLConfig(co.cask.cdap.etl.proto.v2.ETLConfig) ETLStage(co.cask.cdap.etl.proto.v2.ETLStage) BatchPipelineSpec(co.cask.cdap.etl.batch.BatchPipelineSpec) BatchPipelineSpecGenerator(co.cask.cdap.etl.batch.BatchPipelineSpecGenerator) Resources(co.cask.cdap.api.Resources) Test(org.junit.Test)

Example 2 with ETLPlugin

use of co.cask.cdap.etl.proto.v2.ETLPlugin in project cdap by caskdata.

the class PipelineTest method testWordCount.

public void testWordCount(String pluginType) throws Exception {
    String inputName = "wcInput-" + pluginType;
    String outputName = "wcOutput-" + pluginType;
    // create the pipeline config
    ETLStage source = new ETLStage("wcInput", MockSource.getPlugin(inputName));
    ETLStage sink = new ETLStage("wcOutput", MockSink.getPlugin(outputName));
    Map<String, String> aggProperties = new HashMap<>();
    aggProperties.put("field", "text");
    ETLStage agg = new ETLStage("middle", new ETLPlugin("WordCount", pluginType, aggProperties, null));
    ETLBatchConfig pipelineConfig = ETLBatchConfig.builder("* * * * *").addStage(source).addStage(sink).addStage(agg).addConnection(source.getName(), agg.getName()).addConnection(agg.getName(), sink.getName()).build();
    // create the pipeline
    ApplicationId pipelineId = NamespaceId.DEFAULT.app("wcTestPipeline-" + pluginType);
    ApplicationManager appManager = deployApplication(pipelineId, new AppRequest<>(APP_ARTIFACT, pipelineConfig));
    // write the input
    Schema inputSchema = Schema.recordOf("text", Schema.Field.of("text", Schema.of(Schema.Type.STRING)));
    DataSetManager<Table> inputManager = getDataset(inputName);
    List<StructuredRecord> inputRecords = new ArrayList<>();
    inputRecords.add(StructuredRecord.builder(inputSchema).set("text", "Hello World").build());
    inputRecords.add(StructuredRecord.builder(inputSchema).set("text", "Hello my name is Hal").build());
    inputRecords.add(StructuredRecord.builder(inputSchema).set("text", "Hello my name is Sam").build());
    MockSource.writeInput(inputManager, inputRecords);
    WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
    workflowManager.start();
    workflowManager.waitForFinish(4, TimeUnit.MINUTES);
    DataSetManager<Table> outputManager = getDataset(outputName);
    Set<StructuredRecord> outputRecords = new HashSet<>();
    outputRecords.addAll(MockSink.readOutput(outputManager));
    Set<StructuredRecord> expected = new HashSet<>();
    expected.add(StructuredRecord.builder(WordCountAggregator.OUTPUT_SCHEMA).set("word", "Hello").set("count", 3L).build());
    expected.add(StructuredRecord.builder(WordCountAggregator.OUTPUT_SCHEMA).set("word", "World").set("count", 1L).build());
    expected.add(StructuredRecord.builder(WordCountAggregator.OUTPUT_SCHEMA).set("word", "my").set("count", 2L).build());
    expected.add(StructuredRecord.builder(WordCountAggregator.OUTPUT_SCHEMA).set("word", "name").set("count", 2L).build());
    expected.add(StructuredRecord.builder(WordCountAggregator.OUTPUT_SCHEMA).set("word", "is").set("count", 2L).build());
    expected.add(StructuredRecord.builder(WordCountAggregator.OUTPUT_SCHEMA).set("word", "Hal").set("count", 1L).build());
    expected.add(StructuredRecord.builder(WordCountAggregator.OUTPUT_SCHEMA).set("word", "Sam").set("count", 1L).build());
    Assert.assertEquals(expected, outputRecords);
}
Also used : ApplicationManager(co.cask.cdap.test.ApplicationManager) Table(co.cask.cdap.api.dataset.table.Table) KeyValueTable(co.cask.cdap.api.dataset.lib.KeyValueTable) HashMap(java.util.HashMap) Schema(co.cask.cdap.api.data.schema.Schema) WorkflowManager(co.cask.cdap.test.WorkflowManager) ArrayList(java.util.ArrayList) ETLPlugin(co.cask.cdap.etl.proto.v2.ETLPlugin) StructuredRecord(co.cask.cdap.api.data.format.StructuredRecord) ETLBatchConfig(co.cask.cdap.etl.proto.v2.ETLBatchConfig) ETLStage(co.cask.cdap.etl.proto.v2.ETLStage) ApplicationId(co.cask.cdap.proto.id.ApplicationId) HashSet(java.util.HashSet)

Example 3 with ETLPlugin

use of co.cask.cdap.etl.proto.v2.ETLPlugin in project cdap by caskdata.

the class PipelineSpecGeneratorTest method testConflictingPipelineProperties.

@Test(expected = IllegalArgumentException.class)
public void testConflictingPipelineProperties() {
    // populate some mock plugins.
    MockPluginConfigurer pluginConfigurer = new MockPluginConfigurer();
    Set<ArtifactId> artifactIds = ImmutableSet.of(ARTIFACT_ID);
    pluginConfigurer.addMockPlugin(Action.PLUGIN_TYPE, "action1", MockPlugin.builder().putPipelineProperty("prop1", "val1").build(), artifactIds);
    pluginConfigurer.addMockPlugin(Action.PLUGIN_TYPE, "action2", MockPlugin.builder().putPipelineProperty("prop1", "val2").build(), artifactIds);
    PipelineSpecGenerator specGenerator = new BatchPipelineSpecGenerator(pluginConfigurer, ImmutableSet.of(BatchSource.PLUGIN_TYPE), ImmutableSet.of(BatchSink.PLUGIN_TYPE), FileSet.class, DatasetProperties.EMPTY, Engine.MAPREDUCE);
    Map<String, String> empty = ImmutableMap.of();
    ETLConfig config = ETLBatchConfig.builder("* * * * *").addStage(new ETLStage("a1", new ETLPlugin("action1", Action.PLUGIN_TYPE, empty))).addStage(new ETLStage("a2", new ETLPlugin("action2", Action.PLUGIN_TYPE, empty))).addConnection("a1", "a2").setEngine(Engine.MAPREDUCE).build();
    specGenerator.generateSpec(config);
}
Also used : ArtifactId(co.cask.cdap.api.artifact.ArtifactId) BatchPipelineSpecGenerator(co.cask.cdap.etl.batch.BatchPipelineSpecGenerator) ETLStage(co.cask.cdap.etl.proto.v2.ETLStage) ETLPlugin(co.cask.cdap.etl.proto.v2.ETLPlugin) BatchPipelineSpecGenerator(co.cask.cdap.etl.batch.BatchPipelineSpecGenerator) MockPluginConfigurer(co.cask.cdap.etl.common.MockPluginConfigurer) ETLConfig(co.cask.cdap.etl.proto.v2.ETLConfig) Test(org.junit.Test)

Example 4 with ETLPlugin

use of co.cask.cdap.etl.proto.v2.ETLPlugin in project cdap by caskdata.

the class DataPipelineTest method testSinglePhaseWithSparkSink.

private void testSinglePhaseWithSparkSink() throws Exception {
    /*
     * source1 ---|
     *            |--> sparksink
     * source2 ---|
     */
    ETLBatchConfig etlConfig = ETLBatchConfig.builder("* * * * *").addStage(new ETLStage("source1", MockSource.getPlugin("messages1", SpamMessage.SCHEMA))).addStage(new ETLStage("source2", MockSource.getPlugin("messages2", SpamMessage.SCHEMA))).addStage(new ETLStage("customsink", new ETLPlugin(NaiveBayesTrainer.PLUGIN_NAME, SparkSink.PLUGIN_TYPE, ImmutableMap.of("fileSetName", "modelFileSet", "path", "output", "fieldToClassify", SpamMessage.TEXT_FIELD, "predictionField", SpamMessage.SPAM_PREDICTION_FIELD), null))).addConnection("source1", "customsink").addConnection("source2", "customsink").build();
    AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
    ApplicationId appId = NamespaceId.DEFAULT.app("SparkSinkApp");
    ApplicationManager appManager = deployApplication(appId.toId(), appRequest);
    // set up five spam messages and five non-spam messages to be used for classification
    List<StructuredRecord> messagesToWrite = new ArrayList<>();
    messagesToWrite.add(new SpamMessage("buy our clothes", 1.0).toStructuredRecord());
    messagesToWrite.add(new SpamMessage("sell your used books to us", 1.0).toStructuredRecord());
    messagesToWrite.add(new SpamMessage("earn money for free", 1.0).toStructuredRecord());
    messagesToWrite.add(new SpamMessage("this is definitely not spam", 1.0).toStructuredRecord());
    messagesToWrite.add(new SpamMessage("you won the lottery", 1.0).toStructuredRecord());
    // write records to source1
    DataSetManager<Table> inputManager = getDataset(NamespaceId.DEFAULT.dataset("messages1"));
    MockSource.writeInput(inputManager, messagesToWrite);
    messagesToWrite.clear();
    messagesToWrite.add(new SpamMessage("how was your day", 0.0).toStructuredRecord());
    messagesToWrite.add(new SpamMessage("what are you up to", 0.0).toStructuredRecord());
    messagesToWrite.add(new SpamMessage("this is a genuine message", 0.0).toStructuredRecord());
    messagesToWrite.add(new SpamMessage("this is an even more genuine message", 0.0).toStructuredRecord());
    messagesToWrite.add(new SpamMessage("could you send me the report", 0.0).toStructuredRecord());
    // write records to source2
    inputManager = getDataset(NamespaceId.DEFAULT.dataset("messages2"));
    MockSource.writeInput(inputManager, messagesToWrite);
    // ingest in some messages to be classified
    StreamManager textsToClassify = getStreamManager(NaiveBayesTrainer.TEXTS_TO_CLASSIFY);
    textsToClassify.send("how are you doing today");
    textsToClassify.send("free money money");
    textsToClassify.send("what are you doing today");
    textsToClassify.send("genuine report");
    // manually trigger the pipeline
    WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
    workflowManager.start();
    workflowManager.waitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
    DataSetManager<KeyValueTable> classifiedTexts = getDataset(NaiveBayesTrainer.CLASSIFIED_TEXTS);
    Assert.assertEquals(0.0d, Bytes.toDouble(classifiedTexts.get().read("how are you doing today")), 0.01d);
    // only 'free money money' should be predicated as spam
    Assert.assertEquals(1.0d, Bytes.toDouble(classifiedTexts.get().read("free money money")), 0.01d);
    Assert.assertEquals(0.0d, Bytes.toDouble(classifiedTexts.get().read("what are you doing today")), 0.01d);
    Assert.assertEquals(0.0d, Bytes.toDouble(classifiedTexts.get().read("genuine report")), 0.01d);
    validateMetric(5, appId, "source1.records.out");
    validateMetric(5, appId, "source2.records.out");
    validateMetric(10, appId, "customsink.records.in");
}
Also used : ApplicationManager(co.cask.cdap.test.ApplicationManager) KeyValueTable(co.cask.cdap.api.dataset.lib.KeyValueTable) Table(co.cask.cdap.api.dataset.table.Table) SpamMessage(co.cask.cdap.datapipeline.mock.SpamMessage) WorkflowManager(co.cask.cdap.test.WorkflowManager) ArrayList(java.util.ArrayList) ETLPlugin(co.cask.cdap.etl.proto.v2.ETLPlugin) StructuredRecord(co.cask.cdap.api.data.format.StructuredRecord) AppRequest(co.cask.cdap.proto.artifact.AppRequest) ETLBatchConfig(co.cask.cdap.etl.proto.v2.ETLBatchConfig) ETLStage(co.cask.cdap.etl.proto.v2.ETLStage) StreamManager(co.cask.cdap.test.StreamManager) KeyValueTable(co.cask.cdap.api.dataset.lib.KeyValueTable) ApplicationId(co.cask.cdap.proto.id.ApplicationId)

Example 5 with ETLPlugin

use of co.cask.cdap.etl.proto.v2.ETLPlugin in project cdap by caskdata.

the class DataPipelineTest method testSinglePhaseWithSparkCompute.

private void testSinglePhaseWithSparkCompute() throws Exception {
    /*
     * source --> sparkcompute --> sink
     */
    String classifiedTextsTable = "classifiedTextTable";
    ETLBatchConfig etlConfig = ETLBatchConfig.builder("* * * * *").addStage(new ETLStage("source", MockSource.getPlugin(NaiveBayesTrainer.TEXTS_TO_CLASSIFY, SpamMessage.SCHEMA))).addStage(new ETLStage("sparkcompute", new ETLPlugin(NaiveBayesClassifier.PLUGIN_NAME, SparkCompute.PLUGIN_TYPE, ImmutableMap.of("fileSetName", "modelFileSet", "path", "output", "fieldToClassify", SpamMessage.TEXT_FIELD, "fieldToSet", SpamMessage.SPAM_PREDICTION_FIELD), null))).addStage(new ETLStage("sink", MockSink.getPlugin(classifiedTextsTable))).addConnection("source", "sparkcompute").addConnection("sparkcompute", "sink").build();
    AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
    ApplicationId appId = NamespaceId.DEFAULT.app("SparkComputeApp");
    ApplicationManager appManager = deployApplication(appId.toId(), appRequest);
    // write some some messages to be classified
    List<StructuredRecord> messagesToWrite = new ArrayList<>();
    messagesToWrite.add(new SpamMessage("how are you doing today").toStructuredRecord());
    messagesToWrite.add(new SpamMessage("free money money").toStructuredRecord());
    messagesToWrite.add(new SpamMessage("what are you doing today").toStructuredRecord());
    messagesToWrite.add(new SpamMessage("genuine report").toStructuredRecord());
    DataSetManager<Table> inputManager = getDataset(NamespaceId.DEFAULT.dataset(NaiveBayesTrainer.TEXTS_TO_CLASSIFY));
    MockSource.writeInput(inputManager, messagesToWrite);
    // manually trigger the pipeline
    WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
    workflowManager.start();
    workflowManager.waitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
    DataSetManager<Table> classifiedTexts = getDataset(classifiedTextsTable);
    List<StructuredRecord> structuredRecords = MockSink.readOutput(classifiedTexts);
    Set<SpamMessage> results = new HashSet<>();
    for (StructuredRecord structuredRecord : structuredRecords) {
        results.add(SpamMessage.fromStructuredRecord(structuredRecord));
    }
    Set<SpamMessage> expected = new HashSet<>();
    expected.add(new SpamMessage("how are you doing today", 0.0));
    // only 'free money money' should be predicated as spam
    expected.add(new SpamMessage("free money money", 1.0));
    expected.add(new SpamMessage("what are you doing today", 0.0));
    expected.add(new SpamMessage("genuine report", 0.0));
    Assert.assertEquals(expected, results);
    validateMetric(4, appId, "source.records.out");
    validateMetric(4, appId, "sparkcompute.records.in");
    validateMetric(4, appId, "sink.records.in");
}
Also used : ApplicationManager(co.cask.cdap.test.ApplicationManager) KeyValueTable(co.cask.cdap.api.dataset.lib.KeyValueTable) Table(co.cask.cdap.api.dataset.table.Table) SpamMessage(co.cask.cdap.datapipeline.mock.SpamMessage) WorkflowManager(co.cask.cdap.test.WorkflowManager) ArrayList(java.util.ArrayList) ETLPlugin(co.cask.cdap.etl.proto.v2.ETLPlugin) StructuredRecord(co.cask.cdap.api.data.format.StructuredRecord) AppRequest(co.cask.cdap.proto.artifact.AppRequest) ETLBatchConfig(co.cask.cdap.etl.proto.v2.ETLBatchConfig) ETLStage(co.cask.cdap.etl.proto.v2.ETLStage) ApplicationId(co.cask.cdap.proto.id.ApplicationId) HashSet(java.util.HashSet)

Aggregations

ETLPlugin (co.cask.cdap.etl.proto.v2.ETLPlugin)11 ETLStage (co.cask.cdap.etl.proto.v2.ETLStage)11 ETLBatchConfig (co.cask.cdap.etl.proto.v2.ETLBatchConfig)8 ApplicationId (co.cask.cdap.proto.id.ApplicationId)8 ApplicationManager (co.cask.cdap.test.ApplicationManager)8 WorkflowManager (co.cask.cdap.test.WorkflowManager)8 StructuredRecord (co.cask.cdap.api.data.format.StructuredRecord)7 KeyValueTable (co.cask.cdap.api.dataset.lib.KeyValueTable)7 Table (co.cask.cdap.api.dataset.table.Table)7 ArrayList (java.util.ArrayList)7 Test (org.junit.Test)7 HashMap (java.util.HashMap)6 Schema (co.cask.cdap.api.data.schema.Schema)5 AppRequest (co.cask.cdap.proto.artifact.AppRequest)3 HashSet (java.util.HashSet)3 ArtifactId (co.cask.cdap.api.artifact.ArtifactId)2 FileSet (co.cask.cdap.api.dataset.lib.FileSet)2 SpamMessage (co.cask.cdap.datapipeline.mock.SpamMessage)2 BatchPipelineSpecGenerator (co.cask.cdap.etl.batch.BatchPipelineSpecGenerator)2 MockPluginConfigurer (co.cask.cdap.etl.common.MockPluginConfigurer)2