Search in sources :

Example 1 with SpamMessage

use of io.cdap.cdap.datapipeline.mock.SpamMessage in project cdap by caskdata.

the class DataPipelineTest method testSinglePhaseWithSparkSink.

private void testSinglePhaseWithSparkSink() throws Exception {
    /*
     * source1 ---|
     *            |--> sparksink
     * source2 ---|
     */
    ETLBatchConfig etlConfig = ETLBatchConfig.builder().addStage(new ETLStage("source1", MockSource.getPlugin("messages1", SpamMessage.SCHEMA))).addStage(new ETLStage("source2", MockSource.getPlugin("messages2", SpamMessage.SCHEMA))).addStage(new ETLStage("customsink", new ETLPlugin(NaiveBayesTrainer.PLUGIN_NAME, SparkSink.PLUGIN_TYPE, ImmutableMap.of("fileSetName", "modelFileSet", "path", "output", "fieldToClassify", SpamMessage.TEXT_FIELD, "predictionField", SpamMessage.SPAM_PREDICTION_FIELD), null))).addConnection("source1", "customsink").addConnection("source2", "customsink").build();
    AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
    ApplicationId appId = NamespaceId.DEFAULT.app("SparkSinkApp");
    ApplicationManager appManager = deployApplication(appId, appRequest);
    // set up five spam messages and five non-spam messages to be used for classification
    List<StructuredRecord> messagesToWrite = new ArrayList<>();
    messagesToWrite.add(new SpamMessage("buy our clothes", 1.0).toStructuredRecord());
    messagesToWrite.add(new SpamMessage("sell your used books to us", 1.0).toStructuredRecord());
    messagesToWrite.add(new SpamMessage("earn money for free", 1.0).toStructuredRecord());
    messagesToWrite.add(new SpamMessage("this is definitely not spam", 1.0).toStructuredRecord());
    messagesToWrite.add(new SpamMessage("you won the lottery", 1.0).toStructuredRecord());
    // write records to source1
    DataSetManager<Table> inputManager = getDataset(NamespaceId.DEFAULT.dataset("messages1"));
    MockSource.writeInput(inputManager, messagesToWrite);
    messagesToWrite.clear();
    messagesToWrite.add(new SpamMessage("how was your day", 0.0).toStructuredRecord());
    messagesToWrite.add(new SpamMessage("what are you up to", 0.0).toStructuredRecord());
    messagesToWrite.add(new SpamMessage("this is a genuine message", 0.0).toStructuredRecord());
    messagesToWrite.add(new SpamMessage("this is an even more genuine message", 0.0).toStructuredRecord());
    messagesToWrite.add(new SpamMessage("could you send me the report", 0.0).toStructuredRecord());
    // write records to source2
    inputManager = getDataset(NamespaceId.DEFAULT.dataset("messages2"));
    MockSource.writeInput(inputManager, messagesToWrite);
    // ingest in some messages to be classified
    DataSetManager<FileSet> fileSetManager = getDataset(NaiveBayesTrainer.TEXTS_TO_CLASSIFY);
    FileSet fileSet = fileSetManager.get();
    try (PrintStream out = new PrintStream(fileSet.getLocation("inputTexts").getOutputStream(), true, "UTF-8")) {
        out.println("how are you doing today");
        out.println("free money money");
        out.println("what are you doing today");
        out.println("genuine report");
    }
    // manually trigger the pipeline
    Map<String, String> runtimeArgs = new HashMap<>();
    FileSetArguments.setInputPath(runtimeArgs, "inputTexts");
    WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
    workflowManager.start(runtimeArgs);
    workflowManager.waitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
    DataSetManager<KeyValueTable> classifiedTexts = getDataset(NaiveBayesTrainer.CLASSIFIED_TEXTS);
    Assert.assertEquals(0.0d, Bytes.toDouble(classifiedTexts.get().read("how are you doing today")), 0.01d);
    // only 'free money money' should be predicated as spam
    Assert.assertEquals(1.0d, Bytes.toDouble(classifiedTexts.get().read("free money money")), 0.01d);
    Assert.assertEquals(0.0d, Bytes.toDouble(classifiedTexts.get().read("what are you doing today")), 0.01d);
    Assert.assertEquals(0.0d, Bytes.toDouble(classifiedTexts.get().read("genuine report")), 0.01d);
    validateMetric(5, appId, "source1.records.out");
    validateMetric(5, appId, "source2.records.out");
    validateMetric(10, appId, "customsink.records.in");
}
Also used : PrintStream(java.io.PrintStream) ApplicationManager(io.cdap.cdap.test.ApplicationManager) Table(io.cdap.cdap.api.dataset.table.Table) KeyValueTable(io.cdap.cdap.api.dataset.lib.KeyValueTable) FileSet(io.cdap.cdap.api.dataset.lib.FileSet) HashMap(java.util.HashMap) SpamMessage(io.cdap.cdap.datapipeline.mock.SpamMessage) WorkflowManager(io.cdap.cdap.test.WorkflowManager) ArrayList(java.util.ArrayList) ETLPlugin(io.cdap.cdap.etl.proto.v2.ETLPlugin) StructuredRecord(io.cdap.cdap.api.data.format.StructuredRecord) AppRequest(io.cdap.cdap.proto.artifact.AppRequest) ETLBatchConfig(io.cdap.cdap.etl.proto.v2.ETLBatchConfig) ETLStage(io.cdap.cdap.etl.proto.v2.ETLStage) KeyValueTable(io.cdap.cdap.api.dataset.lib.KeyValueTable) ApplicationId(io.cdap.cdap.proto.id.ApplicationId)

Example 2 with SpamMessage

use of io.cdap.cdap.datapipeline.mock.SpamMessage in project cdap by caskdata.

the class DataPipelineTest method testSinglePhaseWithSparkCompute.

private void testSinglePhaseWithSparkCompute() throws Exception {
    /*
     * source --> sparkcompute --> sink
     */
    String classifiedTextsTable = "classifiedTextTable";
    ETLBatchConfig etlConfig = ETLBatchConfig.builder().addStage(new ETLStage("source", MockSource.getPlugin(NaiveBayesTrainer.TEXTS_TO_CLASSIFY_SOURCE, SpamMessage.SCHEMA))).addStage(new ETLStage("sparkcompute", new ETLPlugin(NaiveBayesClassifier.PLUGIN_NAME, SparkCompute.PLUGIN_TYPE, ImmutableMap.of("fileSetName", "modelFileSet", "path", "output", "fieldToClassify", SpamMessage.TEXT_FIELD, "fieldToSet", SpamMessage.SPAM_PREDICTION_FIELD), null))).addStage(new ETLStage("sink", MockSink.getPlugin(classifiedTextsTable))).addConnection("source", "sparkcompute").addConnection("sparkcompute", "sink").build();
    AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
    ApplicationId appId = NamespaceId.DEFAULT.app("SparkComputeApp");
    ApplicationManager appManager = deployApplication(appId, appRequest);
    // write some some messages to be classified
    List<StructuredRecord> messagesToWrite = new ArrayList<>();
    messagesToWrite.add(new SpamMessage("how are you doing today").toStructuredRecord());
    messagesToWrite.add(new SpamMessage("free money money").toStructuredRecord());
    messagesToWrite.add(new SpamMessage("what are you doing today").toStructuredRecord());
    messagesToWrite.add(new SpamMessage("genuine report").toStructuredRecord());
    DataSetManager<Table> inputManager = getDataset(NamespaceId.DEFAULT.dataset(NaiveBayesTrainer.TEXTS_TO_CLASSIFY_SOURCE));
    MockSource.writeInput(inputManager, messagesToWrite);
    // manually trigger the pipeline
    WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
    workflowManager.start();
    workflowManager.waitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
    DataSetManager<Table> classifiedTexts = getDataset(classifiedTextsTable);
    List<StructuredRecord> structuredRecords = MockSink.readOutput(classifiedTexts);
    Set<SpamMessage> results = new HashSet<>();
    for (StructuredRecord structuredRecord : structuredRecords) {
        results.add(SpamMessage.fromStructuredRecord(structuredRecord));
    }
    Set<SpamMessage> expected = new HashSet<>();
    expected.add(new SpamMessage("how are you doing today", 0.0));
    // only 'free money money' should be predicated as spam
    expected.add(new SpamMessage("free money money", 1.0));
    expected.add(new SpamMessage("what are you doing today", 0.0));
    expected.add(new SpamMessage("genuine report", 0.0));
    Assert.assertEquals(expected, results);
    validateMetric(4, appId, "source.records.out");
    validateMetric(4, appId, "sparkcompute.records.in");
    validateMetric(4, appId, "sink.records.in");
}
Also used : ApplicationManager(io.cdap.cdap.test.ApplicationManager) Table(io.cdap.cdap.api.dataset.table.Table) KeyValueTable(io.cdap.cdap.api.dataset.lib.KeyValueTable) SpamMessage(io.cdap.cdap.datapipeline.mock.SpamMessage) WorkflowManager(io.cdap.cdap.test.WorkflowManager) ArrayList(java.util.ArrayList) ETLPlugin(io.cdap.cdap.etl.proto.v2.ETLPlugin) StructuredRecord(io.cdap.cdap.api.data.format.StructuredRecord) AppRequest(io.cdap.cdap.proto.artifact.AppRequest) ETLBatchConfig(io.cdap.cdap.etl.proto.v2.ETLBatchConfig) ETLStage(io.cdap.cdap.etl.proto.v2.ETLStage) ApplicationId(io.cdap.cdap.proto.id.ApplicationId) HashSet(java.util.HashSet)

Aggregations

StructuredRecord (io.cdap.cdap.api.data.format.StructuredRecord)2 KeyValueTable (io.cdap.cdap.api.dataset.lib.KeyValueTable)2 Table (io.cdap.cdap.api.dataset.table.Table)2 SpamMessage (io.cdap.cdap.datapipeline.mock.SpamMessage)2 ETLBatchConfig (io.cdap.cdap.etl.proto.v2.ETLBatchConfig)2 ETLPlugin (io.cdap.cdap.etl.proto.v2.ETLPlugin)2 ETLStage (io.cdap.cdap.etl.proto.v2.ETLStage)2 AppRequest (io.cdap.cdap.proto.artifact.AppRequest)2 ApplicationId (io.cdap.cdap.proto.id.ApplicationId)2 ApplicationManager (io.cdap.cdap.test.ApplicationManager)2 WorkflowManager (io.cdap.cdap.test.WorkflowManager)2 ArrayList (java.util.ArrayList)2 FileSet (io.cdap.cdap.api.dataset.lib.FileSet)1 PrintStream (java.io.PrintStream)1 HashMap (java.util.HashMap)1 HashSet (java.util.HashSet)1