Use of co.cask.cdap.etl.proto.v2.ETLPlugin in project cdap by caskdata.
From the class PipelineSpecGeneratorTest, method testPipelineProperties.
@Test
public void testPipelineProperties() {
// populate some mock plugins.
MockPluginConfigurer pluginConfigurer = new MockPluginConfigurer();
Set<ArtifactId> artifactIds = ImmutableSet.of(ARTIFACT_ID);
pluginConfigurer.addMockPlugin(Action.PLUGIN_TYPE, "action1",
                               MockPlugin.builder()
                                 .putPipelineProperty("prop1", "val1")
                                 .putPipelineProperty("prop2", "val2")
                                 .build(),
                               artifactIds);
pluginConfigurer.addMockPlugin(Action.PLUGIN_TYPE, "action2",
                               MockPlugin.builder().putPipelineProperty("prop2", "val2").build(),
                               artifactIds);
PipelineSpecGenerator specGenerator =
  new BatchPipelineSpecGenerator(pluginConfigurer,
                                 ImmutableSet.of(BatchSource.PLUGIN_TYPE),
                                 ImmutableSet.of(BatchSink.PLUGIN_TYPE),
                                 FileSet.class, DatasetProperties.EMPTY, Engine.MAPREDUCE);
Map<String, String> empty = ImmutableMap.of();
ETLConfig config = ETLBatchConfig.builder("* * * * *")
  .setProperties(ImmutableMap.of("system.spark.spark.test", "abc",
                                 "system.mapreduce.prop3", "val3"))
  .addStage(new ETLStage("a1", new ETLPlugin("action1", Action.PLUGIN_TYPE, empty)))
  .addStage(new ETLStage("a2", new ETLPlugin("action2", Action.PLUGIN_TYPE, empty)))
  .addConnection("a1", "a2")
  .setEngine(Engine.MAPREDUCE)
  .build();
PipelineSpec actual = specGenerator.generateSpec(config);
PipelineSpec expected = BatchPipelineSpec.builder()
  .addConnection("a1", "a2")
  .setProperties(ImmutableMap.of("prop1", "val1", "prop2", "val2", "prop3", "val3"))
  .addStage(StageSpec.builder("a1", new PluginSpec(Action.PLUGIN_TYPE, "action1", empty, ARTIFACT_ID))
              .addOutputs("a2")
              .build())
  .addStage(StageSpec.builder("a2", new PluginSpec(Action.PLUGIN_TYPE, "action2", empty, ARTIFACT_ID))
              .addInputs("a1")
              .build())
  .setResources(new Resources(1024))
  .setDriverResources(new Resources(1024))
  .setClientResources(new Resources(1024))
  .build();
Assert.assertEquals(expected, actual);
}
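Two details are worth noting in the assertion above. First, the expected spec merges each plugin's declared pipeline properties (prop1, prop2) with the engine-scoped config property system.mapreduce.prop3: the "system.mapreduce." prefix is stripped because the engine is MAPREDUCE, while the "system.spark." property is ignored. Second, this test uses the three-argument ETLPlugin constructor; other examples on this page add a fourth argument for artifact selection. A minimal sketch of both forms, assuming only the signatures visible in these tests:

Map<String, String> props = ImmutableMap.of("prop1", "val1");
// three-argument form: plugin name, plugin type, plugin properties
ETLPlugin plugin = new ETLPlugin("action1", Action.PLUGIN_TYPE, props);
// four-argument form adds an artifact selector; passing null (as the tests
// below do) leaves artifact resolution to the platform
ETLPlugin anyArtifact = new ETLPlugin("action1", Action.PLUGIN_TYPE, props, null);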
Use of co.cask.cdap.etl.proto.v2.ETLPlugin in project cdap by caskdata.
From the class PipelineTest, method testWordCount.
public void testWordCount(String pluginType) throws Exception {
String inputName = "wcInput-" + pluginType;
String outputName = "wcOutput-" + pluginType;
// create the pipeline config
ETLStage source = new ETLStage("wcInput", MockSource.getPlugin(inputName));
ETLStage sink = new ETLStage("wcOutput", MockSink.getPlugin(outputName));
Map<String, String> aggProperties = new HashMap<>();
aggProperties.put("field", "text");
ETLStage agg = new ETLStage("middle", new ETLPlugin("WordCount", pluginType, aggProperties, null));
ETLBatchConfig pipelineConfig = ETLBatchConfig.builder("* * * * *")
  .addStage(source)
  .addStage(sink)
  .addStage(agg)
  .addConnection(source.getName(), agg.getName())
  .addConnection(agg.getName(), sink.getName())
  .build();
// create the pipeline
ApplicationId pipelineId = NamespaceId.DEFAULT.app("wcTestPipeline-" + pluginType);
ApplicationManager appManager = deployApplication(pipelineId, new AppRequest<>(APP_ARTIFACT, pipelineConfig));
// write the input
Schema inputSchema = Schema.recordOf("text", Schema.Field.of("text", Schema.of(Schema.Type.STRING)));
DataSetManager<Table> inputManager = getDataset(inputName);
List<StructuredRecord> inputRecords = new ArrayList<>();
inputRecords.add(StructuredRecord.builder(inputSchema).set("text", "Hello World").build());
inputRecords.add(StructuredRecord.builder(inputSchema).set("text", "Hello my name is Hal").build());
inputRecords.add(StructuredRecord.builder(inputSchema).set("text", "Hello my name is Sam").build());
MockSource.writeInput(inputManager, inputRecords);
WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
workflowManager.start();
workflowManager.waitForFinish(4, TimeUnit.MINUTES);
DataSetManager<Table> outputManager = getDataset(outputName);
Set<StructuredRecord> outputRecords = new HashSet<>();
outputRecords.addAll(MockSink.readOutput(outputManager));
Set<StructuredRecord> expected = new HashSet<>();
expected.add(StructuredRecord.builder(WordCountAggregator.OUTPUT_SCHEMA).set("word", "Hello").set("count", 3L).build());
expected.add(StructuredRecord.builder(WordCountAggregator.OUTPUT_SCHEMA).set("word", "World").set("count", 1L).build());
expected.add(StructuredRecord.builder(WordCountAggregator.OUTPUT_SCHEMA).set("word", "my").set("count", 2L).build());
expected.add(StructuredRecord.builder(WordCountAggregator.OUTPUT_SCHEMA).set("word", "name").set("count", 2L).build());
expected.add(StructuredRecord.builder(WordCountAggregator.OUTPUT_SCHEMA).set("word", "is").set("count", 2L).build());
expected.add(StructuredRecord.builder(WordCountAggregator.OUTPUT_SCHEMA).set("word", "Hal").set("count", 1L).build());
expected.add(StructuredRecord.builder(WordCountAggregator.OUTPUT_SCHEMA).set("word", "Sam").set("count", 1L).build());
Assert.assertEquals(expected, outputRecords);
}
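The expected records imply that WordCountAggregator.OUTPUT_SCHEMA is a two-field record: a string "word" and a long "count". A hypothetical reconstruction of the constant (the record name is an assumption; the fields and types follow directly from the assertions above):

// Hypothetical reconstruction of WordCountAggregator.OUTPUT_SCHEMA; the record
// name "wordCount" is assumed, the fields come from the expected records.
Schema OUTPUT_SCHEMA = Schema.recordOf(
  "wordCount",
  Schema.Field.of("word", Schema.of(Schema.Type.STRING)),
  Schema.Field.of("count", Schema.of(Schema.Type.LONG)));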
Use of co.cask.cdap.etl.proto.v2.ETLPlugin in project cdap by caskdata.
From the class PipelineSpecGeneratorTest, method testConflictingPipelineProperties.
@Test(expected = IllegalArgumentException.class)
public void testConflictingPipelineProperties() {
// populate some mock plugins.
MockPluginConfigurer pluginConfigurer = new MockPluginConfigurer();
Set<ArtifactId> artifactIds = ImmutableSet.of(ARTIFACT_ID);
pluginConfigurer.addMockPlugin(Action.PLUGIN_TYPE, "action1",
                               MockPlugin.builder().putPipelineProperty("prop1", "val1").build(),
                               artifactIds);
pluginConfigurer.addMockPlugin(Action.PLUGIN_TYPE, "action2",
                               MockPlugin.builder().putPipelineProperty("prop1", "val2").build(),
                               artifactIds);
PipelineSpecGenerator specGenerator =
  new BatchPipelineSpecGenerator(pluginConfigurer,
                                 ImmutableSet.of(BatchSource.PLUGIN_TYPE),
                                 ImmutableSet.of(BatchSink.PLUGIN_TYPE),
                                 FileSet.class, DatasetProperties.EMPTY, Engine.MAPREDUCE);
Map<String, String> empty = ImmutableMap.of();
ETLConfig config = ETLBatchConfig.builder("* * * * *")
  .addStage(new ETLStage("a1", new ETLPlugin("action1", Action.PLUGIN_TYPE, empty)))
  .addStage(new ETLStage("a2", new ETLPlugin("action2", Action.PLUGIN_TYPE, empty)))
  .addConnection("a1", "a2")
  .setEngine(Engine.MAPREDUCE)
  .build();
specGenerator.generateSpec(config);
}
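Here action1 and action2 both declare the pipeline property prop1, but with different values ("val1" vs. "val2"), so spec generation must fail. An illustrative sketch, not the actual generator code, of a merge step that yields exactly this behavior:

// Illustrative only: merge the pipeline properties contributed by each plugin,
// rejecting two plugins that set the same key to different values.
static Map<String, String> mergePipelineProperties(List<Map<String, String>> perPlugin) {
  Map<String, String> merged = new HashMap<>();
  for (Map<String, String> props : perPlugin) {
    for (Map.Entry<String, String> entry : props.entrySet()) {
      String previous = merged.putIfAbsent(entry.getKey(), entry.getValue());
      if (previous != null && !previous.equals(entry.getValue())) {
        throw new IllegalArgumentException(String.format(
          "Conflicting values '%s' and '%s' for pipeline property '%s'",
          previous, entry.getValue(), entry.getKey()));
      }
    }
  }
  return merged;
}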
Use of co.cask.cdap.etl.proto.v2.ETLPlugin in project cdap by caskdata.
From the class DataPipelineTest, method testSinglePhaseWithSparkSink.
private void testSinglePhaseWithSparkSink() throws Exception {
/*
* source1 ---|
* |--> sparksink
* source2 ---|
*/
ETLBatchConfig etlConfig = ETLBatchConfig.builder("* * * * *")
  .addStage(new ETLStage("source1", MockSource.getPlugin("messages1", SpamMessage.SCHEMA)))
  .addStage(new ETLStage("source2", MockSource.getPlugin("messages2", SpamMessage.SCHEMA)))
  .addStage(new ETLStage("customsink",
                         new ETLPlugin(NaiveBayesTrainer.PLUGIN_NAME, SparkSink.PLUGIN_TYPE,
                                       ImmutableMap.of(
                                         "fileSetName", "modelFileSet",
                                         "path", "output",
                                         "fieldToClassify", SpamMessage.TEXT_FIELD,
                                         "predictionField", SpamMessage.SPAM_PREDICTION_FIELD),
                                       null)))
  .addConnection("source1", "customsink")
  .addConnection("source2", "customsink")
  .build();
AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
ApplicationId appId = NamespaceId.DEFAULT.app("SparkSinkApp");
ApplicationManager appManager = deployApplication(appId.toId(), appRequest);
// set up five spam messages and five non-spam messages to train the classifier
List<StructuredRecord> messagesToWrite = new ArrayList<>();
messagesToWrite.add(new SpamMessage("buy our clothes", 1.0).toStructuredRecord());
messagesToWrite.add(new SpamMessage("sell your used books to us", 1.0).toStructuredRecord());
messagesToWrite.add(new SpamMessage("earn money for free", 1.0).toStructuredRecord());
messagesToWrite.add(new SpamMessage("this is definitely not spam", 1.0).toStructuredRecord());
messagesToWrite.add(new SpamMessage("you won the lottery", 1.0).toStructuredRecord());
// write records to source1
DataSetManager<Table> inputManager = getDataset(NamespaceId.DEFAULT.dataset("messages1"));
MockSource.writeInput(inputManager, messagesToWrite);
messagesToWrite.clear();
messagesToWrite.add(new SpamMessage("how was your day", 0.0).toStructuredRecord());
messagesToWrite.add(new SpamMessage("what are you up to", 0.0).toStructuredRecord());
messagesToWrite.add(new SpamMessage("this is a genuine message", 0.0).toStructuredRecord());
messagesToWrite.add(new SpamMessage("this is an even more genuine message", 0.0).toStructuredRecord());
messagesToWrite.add(new SpamMessage("could you send me the report", 0.0).toStructuredRecord());
// write records to source2
inputManager = getDataset(NamespaceId.DEFAULT.dataset("messages2"));
MockSource.writeInput(inputManager, messagesToWrite);
// ingest some messages to be classified
StreamManager textsToClassify = getStreamManager(NaiveBayesTrainer.TEXTS_TO_CLASSIFY);
textsToClassify.send("how are you doing today");
textsToClassify.send("free money money");
textsToClassify.send("what are you doing today");
textsToClassify.send("genuine report");
// manually trigger the pipeline
WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
workflowManager.start();
workflowManager.waitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
DataSetManager<KeyValueTable> classifiedTexts = getDataset(NaiveBayesTrainer.CLASSIFIED_TEXTS);
Assert.assertEquals(0.0d, Bytes.toDouble(classifiedTexts.get().read("how are you doing today")), 0.01d);
// only 'free money money' should be predicted as spam
Assert.assertEquals(1.0d, Bytes.toDouble(classifiedTexts.get().read("free money money")), 0.01d);
Assert.assertEquals(0.0d, Bytes.toDouble(classifiedTexts.get().read("what are you doing today")), 0.01d);
Assert.assertEquals(0.0d, Bytes.toDouble(classifiedTexts.get().read("genuine report")), 0.01d);
validateMetric(5, appId, "source1.records.out");
validateMetric(5, appId, "source2.records.out");
validateMetric(10, appId, "customsink.records.in");
}
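Both Spark tests lean on the SpamMessage helper to move between plain values and StructuredRecords. A plausible sketch of that helper, inferred only from the calls in these tests; the SPAM_PREDICTION_FIELD value and record name are assumptions, and the real class also needs equals/hashCode for the set comparison in the next test:

// Plausible sketch of the SpamMessage test helper; the names marked below are
// assumptions, not taken from the CDAP sources.
public class SpamMessage {
  static final String TEXT_FIELD = "text";
  static final String SPAM_PREDICTION_FIELD = "isSpam"; // assumed value
  static final Schema SCHEMA = Schema.recordOf(
    "simpleMessage", // assumed record name
    Schema.Field.of(TEXT_FIELD, Schema.of(Schema.Type.STRING)),
    Schema.Field.of(SPAM_PREDICTION_FIELD, Schema.nullableOf(Schema.of(Schema.Type.DOUBLE))));

  private final String text;
  private final Double spamPrediction; // null for unlabeled messages

  SpamMessage(String text) {
    this(text, null);
  }

  SpamMessage(String text, Double spamPrediction) {
    this.text = text;
    this.spamPrediction = spamPrediction;
  }

  StructuredRecord toStructuredRecord() {
    StructuredRecord.Builder builder = StructuredRecord.builder(SCHEMA).set(TEXT_FIELD, text);
    if (spamPrediction != null) {
      builder.set(SPAM_PREDICTION_FIELD, spamPrediction);
    }
    return builder.build();
  }
}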
Use of co.cask.cdap.etl.proto.v2.ETLPlugin in project cdap by caskdata.
From the class DataPipelineTest, method testSinglePhaseWithSparkCompute.
private void testSinglePhaseWithSparkCompute() throws Exception {
/*
* source --> sparkcompute --> sink
*/
String classifiedTextsTable = "classifiedTextTable";
ETLBatchConfig etlConfig = ETLBatchConfig.builder("* * * * *")
  .addStage(new ETLStage("source",
                         MockSource.getPlugin(NaiveBayesTrainer.TEXTS_TO_CLASSIFY, SpamMessage.SCHEMA)))
  .addStage(new ETLStage("sparkcompute",
                         new ETLPlugin(NaiveBayesClassifier.PLUGIN_NAME, SparkCompute.PLUGIN_TYPE,
                                       ImmutableMap.of(
                                         "fileSetName", "modelFileSet",
                                         "path", "output",
                                         "fieldToClassify", SpamMessage.TEXT_FIELD,
                                         "fieldToSet", SpamMessage.SPAM_PREDICTION_FIELD),
                                       null)))
  .addStage(new ETLStage("sink", MockSink.getPlugin(classifiedTextsTable)))
  .addConnection("source", "sparkcompute")
  .addConnection("sparkcompute", "sink")
  .build();
AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
ApplicationId appId = NamespaceId.DEFAULT.app("SparkComputeApp");
ApplicationManager appManager = deployApplication(appId.toId(), appRequest);
// write some messages to be classified
List<StructuredRecord> messagesToWrite = new ArrayList<>();
messagesToWrite.add(new SpamMessage("how are you doing today").toStructuredRecord());
messagesToWrite.add(new SpamMessage("free money money").toStructuredRecord());
messagesToWrite.add(new SpamMessage("what are you doing today").toStructuredRecord());
messagesToWrite.add(new SpamMessage("genuine report").toStructuredRecord());
DataSetManager<Table> inputManager = getDataset(NamespaceId.DEFAULT.dataset(NaiveBayesTrainer.TEXTS_TO_CLASSIFY));
MockSource.writeInput(inputManager, messagesToWrite);
// manually trigger the pipeline
WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
workflowManager.start();
workflowManager.waitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
DataSetManager<Table> classifiedTexts = getDataset(classifiedTextsTable);
List<StructuredRecord> structuredRecords = MockSink.readOutput(classifiedTexts);
Set<SpamMessage> results = new HashSet<>();
for (StructuredRecord structuredRecord : structuredRecords) {
results.add(SpamMessage.fromStructuredRecord(structuredRecord));
}
Set<SpamMessage> expected = new HashSet<>();
expected.add(new SpamMessage("how are you doing today", 0.0));
// only 'free money money' should be predicted as spam
expected.add(new SpamMessage("free money money", 1.0));
expected.add(new SpamMessage("what are you doing today", 0.0));
expected.add(new SpamMessage("genuine report", 0.0));
Assert.assertEquals(expected, results);
validateMetric(4, appId, "source.records.out");
validateMetric(4, appId, "sparkcompute.records.in");
validateMetric(4, appId, "sink.records.in");
}
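The readback path of the same helper, again a sketch under the assumptions above:

// Plausible counterpart to toStructuredRecord(), inferred from its use in this
// test; record.get returns null when the prediction field was never set.
static SpamMessage fromStructuredRecord(StructuredRecord record) {
  String text = record.get(TEXT_FIELD);
  Double prediction = record.get(SPAM_PREDICTION_FIELD);
  return new SpamMessage(text, prediction);
}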