Example 81 with KeyValueTable

Use of co.cask.cdap.api.dataset.lib.KeyValueTable in project cdap by caskdata.

From the class DataPipelineTest, method testKVTableLookup.

@Test
public void testKVTableLookup() throws Exception {
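    // create a KeyValueTable instance and seed it with ages keyed by person name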
    addDatasetInstance(KeyValueTable.class.getName(), "ageTable");
    DataSetManager<KeyValueTable> lookupTable = getDataset("ageTable");
    lookupTable.get().write("samuel".getBytes(Charsets.UTF_8), "12".getBytes(Charsets.UTF_8));
    lookupTable.get().write("bob".getBytes(Charsets.UTF_8), "36".getBytes(Charsets.UTF_8));
    lookupTable.get().write("jane".getBytes(Charsets.UTF_8), "25".getBytes(Charsets.UTF_8));
    lookupTable.flush();
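    // build the pipeline: mock source -> lookup transform -> mock sink, on a per-minute cron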
    ETLBatchConfig etlConfig = ETLBatchConfig.builder("* * * * *")
        .addStage(new ETLStage("source", MockSource.getPlugin("inputTable")))
        .addStage(new ETLStage("transform", LookupTransform.getPlugin("person", "age", "ageTable")))
        .addStage(new ETLStage("sink", MockSink.getPlugin("outputTable")))
        .addConnection("source", "transform")
        .addConnection("transform", "sink")
        .build();
    ApplicationId appId = NamespaceId.DEFAULT.app("testKVTableLookup");
    AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
    ApplicationManager appManager = deployApplication(appId, appRequest);
    // set up input data
    Schema inputSchema = Schema.recordOf("person", Schema.Field.of("person", Schema.of(Schema.Type.STRING)));
    StructuredRecord recordSamuel = StructuredRecord.builder(inputSchema).set("person", "samuel").build();
    StructuredRecord recordBob = StructuredRecord.builder(inputSchema).set("person", "bob").build();
    StructuredRecord recordJane = StructuredRecord.builder(inputSchema).set("person", "jane").build();
    DataSetManager<Table> inputTable = getDataset("inputTable");
    MockSource.writeInput(inputTable, ImmutableList.of(recordSamuel, recordBob, recordJane));
    WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME).start();
    workflowManager.waitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
    Schema schema = Schema.recordOf("person", Schema.Field.of("person", Schema.of(Schema.Type.STRING)), Schema.Field.of("age", Schema.of(Schema.Type.STRING)));
    Set<StructuredRecord> expected = new HashSet<>();
    expected.add(StructuredRecord.builder(schema).set("person", "samuel").set("age", "12").build());
    expected.add(StructuredRecord.builder(schema).set("person", "bob").set("age", "36").build());
    expected.add(StructuredRecord.builder(schema).set("person", "jane").set("age", "25").build());
    DataSetManager<Table> outputTable = getDataset("outputTable");
    Set<StructuredRecord> actual = new HashSet<>(MockSink.readOutput(outputTable));
    Assert.assertEquals(expected, actual);
    validateMetric(3, appId, "source.records.out");
    validateMetric(3, appId, "sink.records.in");
    deleteDatasetInstance(NamespaceId.DEFAULT.dataset("inputTable"));
    deleteDatasetInstance(NamespaceId.DEFAULT.dataset("outputTable"));
}
Also used : ApplicationManager(co.cask.cdap.test.ApplicationManager) KeyValueTable(co.cask.cdap.api.dataset.lib.KeyValueTable) Table(co.cask.cdap.api.dataset.table.Table) Schema(co.cask.cdap.api.data.schema.Schema) WorkflowManager(co.cask.cdap.test.WorkflowManager) StructuredRecord(co.cask.cdap.api.data.format.StructuredRecord) AppRequest(co.cask.cdap.proto.artifact.AppRequest) ETLBatchConfig(co.cask.cdap.etl.proto.v2.ETLBatchConfig) ETLStage(co.cask.cdap.etl.proto.v2.ETLStage) ApplicationId(co.cask.cdap.proto.id.ApplicationId) HashSet(java.util.HashSet) Test(org.junit.Test)
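
The lookup above boils down to a byte-oriented put/get on a KeyValueTable. A minimal sketch of that round trip, assuming a KeyValueTable obtained as in the test (the helper method name is hypothetical):

private void seedAndVerify(KeyValueTable ages) {
    // KeyValueTable stores raw byte[] keys and values, so strings are encoded explicitly
    ages.write("samuel".getBytes(Charsets.UTF_8), "12".getBytes(Charsets.UTF_8));
    // read(byte[]) returns null when the key is absent
    byte[] value = ages.read("samuel".getBytes(Charsets.UTF_8));
    Assert.assertEquals("12", new String(value, Charsets.UTF_8));
}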

Example 82 with KeyValueTable

Use of co.cask.cdap.api.dataset.lib.KeyValueTable in project cdap by caskdata.

From the class DataPipelineTest, method testSinglePhaseWithSparkSink.

private void testSinglePhaseWithSparkSink() throws Exception {
    /*
     * source1 ---|
     *            |--> sparksink
     * source2 ---|
     */
    ETLBatchConfig etlConfig = ETLBatchConfig.builder("* * * * *")
        .addStage(new ETLStage("source1", MockSource.getPlugin("messages1", SpamMessage.SCHEMA)))
        .addStage(new ETLStage("source2", MockSource.getPlugin("messages2", SpamMessage.SCHEMA)))
        .addStage(new ETLStage("customsink", new ETLPlugin(
            NaiveBayesTrainer.PLUGIN_NAME, SparkSink.PLUGIN_TYPE,
            ImmutableMap.of(
                "fileSetName", "modelFileSet",
                "path", "output",
                "fieldToClassify", SpamMessage.TEXT_FIELD,
                "predictionField", SpamMessage.SPAM_PREDICTION_FIELD),
            null)))
        .addConnection("source1", "customsink")
        .addConnection("source2", "customsink")
        .build();
    AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
    ApplicationId appId = NamespaceId.DEFAULT.app("SparkSinkApp");
    ApplicationManager appManager = deployApplication(appId, appRequest);
    // set up five spam messages and five non-spam messages to be used for classification
    List<StructuredRecord> messagesToWrite = new ArrayList<>();
    messagesToWrite.add(new SpamMessage("buy our clothes", 1.0).toStructuredRecord());
    messagesToWrite.add(new SpamMessage("sell your used books to us", 1.0).toStructuredRecord());
    messagesToWrite.add(new SpamMessage("earn money for free", 1.0).toStructuredRecord());
    messagesToWrite.add(new SpamMessage("this is definitely not spam", 1.0).toStructuredRecord());
    messagesToWrite.add(new SpamMessage("you won the lottery", 1.0).toStructuredRecord());
    // write records to source1
    DataSetManager<Table> inputManager = getDataset(NamespaceId.DEFAULT.dataset("messages1"));
    MockSource.writeInput(inputManager, messagesToWrite);
    messagesToWrite.clear();
    messagesToWrite.add(new SpamMessage("how was your day", 0.0).toStructuredRecord());
    messagesToWrite.add(new SpamMessage("what are you up to", 0.0).toStructuredRecord());
    messagesToWrite.add(new SpamMessage("this is a genuine message", 0.0).toStructuredRecord());
    messagesToWrite.add(new SpamMessage("this is an even more genuine message", 0.0).toStructuredRecord());
    messagesToWrite.add(new SpamMessage("could you send me the report", 0.0).toStructuredRecord());
    // write records to source2
    inputManager = getDataset(NamespaceId.DEFAULT.dataset("messages2"));
    MockSource.writeInput(inputManager, messagesToWrite);
    // ingest some messages to be classified
    StreamManager textsToClassify = getStreamManager(NaiveBayesTrainer.TEXTS_TO_CLASSIFY);
    textsToClassify.send("how are you doing today");
    textsToClassify.send("free money money");
    textsToClassify.send("what are you doing today");
    textsToClassify.send("genuine report");
    // manually trigger the pipeline
    WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
    workflowManager.start();
    workflowManager.waitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
    DataSetManager<KeyValueTable> classifiedTexts = getDataset(NaiveBayesTrainer.CLASSIFIED_TEXTS);
    Assert.assertEquals(0.0d, Bytes.toDouble(classifiedTexts.get().read("how are you doing today")), 0.01d);
    // only 'free money money' should be predicted as spam
    Assert.assertEquals(1.0d, Bytes.toDouble(classifiedTexts.get().read("free money money")), 0.01d);
    Assert.assertEquals(0.0d, Bytes.toDouble(classifiedTexts.get().read("what are you doing today")), 0.01d);
    Assert.assertEquals(0.0d, Bytes.toDouble(classifiedTexts.get().read("genuine report")), 0.01d);
    validateMetric(5, appId, "source1.records.out");
    validateMetric(5, appId, "source2.records.out");
    validateMetric(10, appId, "customsink.records.in");
}
Also used : ApplicationManager(co.cask.cdap.test.ApplicationManager) KeyValueTable(co.cask.cdap.api.dataset.lib.KeyValueTable) Table(co.cask.cdap.api.dataset.table.Table) SpamMessage(co.cask.cdap.datapipeline.mock.SpamMessage) WorkflowManager(co.cask.cdap.test.WorkflowManager) ArrayList(java.util.ArrayList) ETLPlugin(co.cask.cdap.etl.proto.v2.ETLPlugin) StructuredRecord(co.cask.cdap.api.data.format.StructuredRecord) AppRequest(co.cask.cdap.proto.artifact.AppRequest) ETLBatchConfig(co.cask.cdap.etl.proto.v2.ETLBatchConfig) ETLStage(co.cask.cdap.etl.proto.v2.ETLStage) StreamManager(co.cask.cdap.test.StreamManager) ApplicationId(co.cask.cdap.proto.id.ApplicationId)
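
The assertions above decode each stored prediction with Bytes.toDouble. A minimal sketch of that read path, assuming the sink wrote predictions as 8-byte doubles (inferred from the Bytes.toDouble calls; the helper is hypothetical):

private double spamScore(KeyValueTable classified, String text) {
    // read(String) looks up the UTF-8 bytes of the key; null means the text was never classified
    byte[] raw = classified.read(text);
    Assert.assertNotNull("no prediction stored for: " + text, raw);
    return Bytes.toDouble(raw);
}

With that helper, each check collapses to a one-liner such as Assert.assertEquals(1.0d, spamScore(classifiedTexts.get(), "free money money"), 0.01d).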

Example 83 with KeyValueTable

Use of co.cask.cdap.api.dataset.lib.KeyValueTable in project cdap by caskdata.

From the class MultiConsumerTest, method testMulti.

@Test
public void testMulti() throws Exception {
    // TODO: Fix this test case to really test with numGroups settings.
    final ApplicationWithPrograms app = AppFabricTestHelper.deployApplicationWithManager(MultiApp.class, TEMP_FOLDER_SUPPLIER);
    List<ProgramController> controllers = Lists.newArrayList();
    for (ProgramDescriptor programDescriptor : app.getPrograms()) {
        controllers.add(AppFabricTestHelper.submit(app, programDescriptor.getSpecification().getClassName(), new BasicArguments(), TEMP_FOLDER_SUPPLIER));
    }
    DatasetFramework datasetFramework = AppFabricTestHelper.getInjector().getInstance(DatasetFramework.class);
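    // build a dataset cache so the test can obtain the "accumulated" dataset outside any program context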
    DynamicDatasetCache datasetCache = new SingleThreadDatasetCache(new SystemDatasetInstantiator(datasetFramework, getClass().getClassLoader(), null), AppFabricTestHelper.getInjector().getInstance(TransactionSystemClient.class), NamespaceId.DEFAULT, DatasetDefinition.NO_ARGUMENTS, null, null);
    final KeyValueTable accumulated = datasetCache.getDataset("accumulated");
    TransactionExecutorFactory txExecutorFactory = AppFabricTestHelper.getInjector().getInstance(TransactionExecutorFactory.class);
    // Try to get the accumulated result and verify it; expect the result to appear within 60 seconds.
    int trial = 0;
    while (trial < 60) {
        try {
            Transactions.createTransactionExecutor(txExecutorFactory, accumulated).execute(new TransactionExecutor.Subroutine() {

                @Override
                public void apply() throws Exception {
                    byte[] value = accumulated.read(MultiApp.KEY);
                    // Sum(1..99) * 3 = 14850
                    Assert.assertEquals(((1 + 99) * 99 / 2) * 3, Longs.fromByteArray(value));
                }
            });
            break;
        } catch (TransactionFailureException e) {
            // result not available yet; sleep and retry
            trial++;
            TimeUnit.SECONDS.sleep(1);
        }
    }
    Assert.assertTrue(trial < 60);
    for (ProgramController controller : controllers) {
        controller.stop().get();
    }
}
Also used : ProgramController(co.cask.cdap.app.runtime.ProgramController) DynamicDatasetCache(co.cask.cdap.data2.dataset2.DynamicDatasetCache) TransactionExecutor(org.apache.tephra.TransactionExecutor) SingleThreadDatasetCache(co.cask.cdap.data2.dataset2.SingleThreadDatasetCache) TransactionFailureException(org.apache.tephra.TransactionFailureException) IOException(java.io.IOException) TransactionExecutorFactory(org.apache.tephra.TransactionExecutorFactory) DatasetFramework(co.cask.cdap.data2.dataset2.DatasetFramework) TransactionSystemClient(org.apache.tephra.TransactionSystemClient) ApplicationWithPrograms(co.cask.cdap.internal.app.deploy.pipeline.ApplicationWithPrograms) SystemDatasetInstantiator(co.cask.cdap.data.dataset.SystemDatasetInstantiator) KeyValueTable(co.cask.cdap.api.dataset.lib.KeyValueTable) ProgramDescriptor(co.cask.cdap.app.program.ProgramDescriptor) BasicArguments(co.cask.cdap.internal.app.runtime.BasicArguments) Test(org.junit.Test)
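
The while loop above is a generic retry-until-committed pattern: run the read inside a TransactionExecutor and treat a TransactionFailureException (here, a failed assertion) as "not yet", then sleep and retry. A minimal sketch of that pattern extracted into a helper, using only the classes already used above (the helper name and attempt count are illustrative):

private void eventually(TransactionExecutorFactory factory, KeyValueTable table,
                        TransactionExecutor.Subroutine check, int maxAttempts) throws Exception {
    for (int attempt = 0; attempt < maxAttempts; attempt++) {
        try {
            // each attempt runs in its own transaction, so the check sees a consistent snapshot
            Transactions.createTransactionExecutor(factory, table).execute(check);
            return;
        } catch (TransactionFailureException e) {
            TimeUnit.SECONDS.sleep(1);
        }
    }
    Assert.fail("condition not met after " + maxAttempts + " attempts");
}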

Example 84 with KeyValueTable

Use of co.cask.cdap.api.dataset.lib.KeyValueTable in project cdap by caskdata.

From the class PipelineTest, method testWordCountSparkSink.

@SuppressWarnings("ConstantConditions")
@Test
public void testWordCountSparkSink() throws Exception {
    String inputName = "sparkSinkInput";
    String outputName = "sparkSinkOutput";
    // create the pipeline config
    ETLStage source = new ETLStage("source", MockSource.getPlugin(inputName));
    Map<String, String> sinkProperties = new HashMap<>();
    sinkProperties.put("field", "text");
    sinkProperties.put("tableName", outputName);
    ETLStage sink = new ETLStage("sink", new ETLPlugin(WordCountSink.NAME, SparkSink.PLUGIN_TYPE, sinkProperties, null));
    ETLBatchConfig pipelineConfig = ETLBatchConfig.builder("* * * * *")
        .addStage(source)
        .addStage(sink)
        .addConnection(source.getName(), sink.getName())
        .build();
    // create the pipeline
    ApplicationId pipelineId = NamespaceId.DEFAULT.app("sparkSinkTestPipeline");
    ApplicationManager appManager = deployApplication(pipelineId, new AppRequest<>(APP_ARTIFACT, pipelineConfig));
    // write the input
    Schema inputSchema = Schema.recordOf("text", Schema.Field.of("text", Schema.of(Schema.Type.STRING)));
    DataSetManager<Table> inputManager = getDataset(inputName);
    List<StructuredRecord> inputRecords = new ArrayList<>();
    inputRecords.add(StructuredRecord.builder(inputSchema).set("text", "Hello World").build());
    inputRecords.add(StructuredRecord.builder(inputSchema).set("text", "Hello my name is Hal").build());
    inputRecords.add(StructuredRecord.builder(inputSchema).set("text", "Hello my name is Sam").build());
    MockSource.writeInput(inputManager, inputRecords);
    WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
    workflowManager.start();
    workflowManager.waitForFinish(4, TimeUnit.MINUTES);
    DataSetManager<KeyValueTable> outputManager = getDataset(outputName);
    KeyValueTable output = outputManager.get();
    Assert.assertEquals(3L, Bytes.toLong(output.read("Hello")));
    Assert.assertEquals(1L, Bytes.toLong(output.read("World")));
    Assert.assertEquals(2L, Bytes.toLong(output.read("my")));
    Assert.assertEquals(2L, Bytes.toLong(output.read("name")));
    Assert.assertEquals(2L, Bytes.toLong(output.read("is")));
    Assert.assertEquals(1L, Bytes.toLong(output.read("Hal")));
    Assert.assertEquals(1L, Bytes.toLong(output.read("Sam")));
}
Also used : ApplicationManager(co.cask.cdap.test.ApplicationManager) Table(co.cask.cdap.api.dataset.table.Table) KeyValueTable(co.cask.cdap.api.dataset.lib.KeyValueTable) HashMap(java.util.HashMap) Schema(co.cask.cdap.api.data.schema.Schema) WorkflowManager(co.cask.cdap.test.WorkflowManager) ArrayList(java.util.ArrayList) ETLPlugin(co.cask.cdap.etl.proto.v2.ETLPlugin) StructuredRecord(co.cask.cdap.api.data.format.StructuredRecord) ETLBatchConfig(co.cask.cdap.etl.proto.v2.ETLBatchConfig) ETLStage(co.cask.cdap.etl.proto.v2.ETLStage) ApplicationId(co.cask.cdap.proto.id.ApplicationId) Test(org.junit.Test)
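
The read-back above pairs KeyValueTable.read with Bytes.toLong for each expected word. A minimal sketch that drives the same checks from a map, assuming the sink stores counts as 8-byte longs (inferred from the Bytes.toLong calls; the helper is hypothetical):

private void assertCounts(KeyValueTable output, Map<String, Long> expected) {
    for (Map.Entry<String, Long> entry : expected.entrySet()) {
        // read(String) keys by the UTF-8 bytes of the word; Bytes.toLong decodes the stored count
        Assert.assertEquals((long) entry.getValue(), Bytes.toLong(output.read(entry.getKey())));
    }
}

For example, the first three assertions become assertCounts(output, ImmutableMap.of("Hello", 3L, "World", 1L, "my", 2L)).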

Aggregations

KeyValueTable (co.cask.cdap.api.dataset.lib.KeyValueTable) 84
Test (org.junit.Test) 49
ApplicationManager (co.cask.cdap.test.ApplicationManager) 45
SparkManager (co.cask.cdap.test.SparkManager) 25
StreamManager (co.cask.cdap.test.StreamManager) 16
IOException (java.io.IOException) 16
TransactionExecutor (org.apache.tephra.TransactionExecutor) 12
ApplicationWithPrograms (co.cask.cdap.internal.app.deploy.pipeline.ApplicationWithPrograms) 11
HashMap (java.util.HashMap) 11
ArrayList (java.util.ArrayList) 10
FileSet (co.cask.cdap.api.dataset.lib.FileSet) 8
KeyValue (co.cask.cdap.api.dataset.lib.KeyValue) 8
Table (co.cask.cdap.api.dataset.table.Table) 8
NamespaceMeta (co.cask.cdap.proto.NamespaceMeta) 8
ObjectStore (co.cask.cdap.api.dataset.lib.ObjectStore) 7
MapReduceManager (co.cask.cdap.test.MapReduceManager) 7
ServiceManager (co.cask.cdap.test.ServiceManager) 7
WorkflowManager (co.cask.cdap.test.WorkflowManager) 7
Set (java.util.Set) 7
Category (org.junit.experimental.categories.Category) 7