use of co.cask.cdap.api.dataset.lib.KeyValueTable in project cdap by caskdata.
the class DataPipelineTest method testKVTableLookup.
/**
 * Runs a batch pipeline of the form source -> lookup transform -> sink and verifies that every
 * input person record comes out with the age stored for that person in the "ageTable" dataset.
 */
@Test
public void testKVTableLookup() throws Exception {
  // seed the lookup dataset with person -> age entries
  addDatasetInstance(KeyValueTable.class.getName(), "ageTable");
  DataSetManager<KeyValueTable> ageTableManager = getDataset("ageTable");
  KeyValueTable ageTable = ageTableManager.get();
  ageTable.write("samuel".getBytes(Charsets.UTF_8), "12".getBytes(Charsets.UTF_8));
  ageTable.write("bob".getBytes(Charsets.UTF_8), "36".getBytes(Charsets.UTF_8));
  ageTable.write("jane".getBytes(Charsets.UTF_8), "25".getBytes(Charsets.UTF_8));
  ageTableManager.flush();

  // describe the pipeline: source -> transform -> sink
  ETLStage sourceStage = new ETLStage("source", MockSource.getPlugin("inputTable"));
  ETLStage lookupStage = new ETLStage("transform", LookupTransform.getPlugin("person", "age", "ageTable"));
  ETLStage sinkStage = new ETLStage("sink", MockSink.getPlugin("outputTable"));
  ETLBatchConfig etlConfig = ETLBatchConfig.builder("* * * * *")
    .addStage(sourceStage)
    .addStage(lookupStage)
    .addStage(sinkStage)
    .addConnection("source", "transform")
    .addConnection("transform", "sink")
    .build();

  // deploy the pipeline as an application
  ApplicationId appId = NamespaceId.DEFAULT.app("testKVTableLookup");
  AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
  ApplicationManager appManager = deployApplication(appId, appRequest);

  // write one input record per person; the input schema carries the name only
  Schema nameOnlySchema = Schema.recordOf("person", Schema.Field.of("person", Schema.of(Schema.Type.STRING)));
  StructuredRecord samuelInput = StructuredRecord.builder(nameOnlySchema).set("person", "samuel").build();
  StructuredRecord bobInput = StructuredRecord.builder(nameOnlySchema).set("person", "bob").build();
  StructuredRecord janeInput = StructuredRecord.builder(nameOnlySchema).set("person", "jane").build();
  DataSetManager<Table> inputManager = getDataset("inputTable");
  MockSource.writeInput(inputManager, ImmutableList.of(samuelInput, bobInput, janeInput));

  // run the workflow and wait for it to complete
  WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
  workflowManager.start();
  workflowManager.waitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);

  // the output schema carries the name plus the looked-up age as a string
  Schema nameAndAgeSchema = Schema.recordOf(
    "person",
    Schema.Field.of("person", Schema.of(Schema.Type.STRING)),
    Schema.Field.of("age", Schema.of(Schema.Type.STRING)));
  Set<StructuredRecord> expectedRecords = new HashSet<>();
  expectedRecords.add(StructuredRecord.builder(nameAndAgeSchema).set("person", "samuel").set("age", "12").build());
  expectedRecords.add(StructuredRecord.builder(nameAndAgeSchema).set("person", "bob").set("age", "36").build());
  expectedRecords.add(StructuredRecord.builder(nameAndAgeSchema).set("person", "jane").set("age", "25").build());

  // compare as sets: the order in which the sink received the records is not checked
  DataSetManager<Table> outputManager = getDataset("outputTable");
  Set<StructuredRecord> actualRecords = new HashSet<>(MockSink.readOutput(outputManager));
  Assert.assertEquals(expectedRecords, actualRecords);

  validateMetric(3, appId, "source.records.out");
  validateMetric(3, appId, "sink.records.in");

  // remove the input and output datasets used by this test
  deleteDatasetInstance(NamespaceId.DEFAULT.dataset("inputTable"));
  deleteDatasetInstance(NamespaceId.DEFAULT.dataset("outputTable"));
}
use of co.cask.cdap.api.dataset.lib.KeyValueTable in project cdap by caskdata.
the class DataPipelineTest method testSinglePhaseWithSparkSink.
/**
 * Runs a single-phase pipeline in which two mock sources feed one Spark sink configured with the
 * {@link NaiveBayesTrainer} plugin, then checks the predictions written for a handful of texts
 * and the per-stage record metrics.
 */
private void testSinglePhaseWithSparkSink() throws Exception {
  /*
   * source1 ---|
   *            |--> sparksink
   * source2 ---|
   */
  ImmutableMap<String, String> trainerProperties = ImmutableMap.of(
    "fileSetName", "modelFileSet",
    "path", "output",
    "fieldToClassify", SpamMessage.TEXT_FIELD,
    "predictionField", SpamMessage.SPAM_PREDICTION_FIELD);
  ETLStage firstSource = new ETLStage("source1", MockSource.getPlugin("messages1", SpamMessage.SCHEMA));
  ETLStage secondSource = new ETLStage("source2", MockSource.getPlugin("messages2", SpamMessage.SCHEMA));
  ETLStage trainerSink = new ETLStage(
    "customsink",
    new ETLPlugin(NaiveBayesTrainer.PLUGIN_NAME, SparkSink.PLUGIN_TYPE, trainerProperties, null));
  ETLBatchConfig etlConfig = ETLBatchConfig.builder("* * * * *")
    .addStage(firstSource)
    .addStage(secondSource)
    .addStage(trainerSink)
    .addConnection("source1", "customsink")
    .addConnection("source2", "customsink")
    .build();

  AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
  ApplicationId appId = NamespaceId.DEFAULT.app("SparkSinkApp");
  ApplicationManager appManager = deployApplication(appId, appRequest);

  // five messages labelled as spam (1.0) go to source1
  String[] spamTexts = {
    "buy our clothes",
    "sell your used books to us",
    "earn money for free",
    "this is definitely not spam",
    "you won the lottery"
  };
  List<StructuredRecord> spamRecords = new ArrayList<>();
  for (String text : spamTexts) {
    spamRecords.add(new SpamMessage(text, 1.0).toStructuredRecord());
  }
  DataSetManager<Table> inputManager = getDataset(NamespaceId.DEFAULT.dataset("messages1"));
  MockSource.writeInput(inputManager, spamRecords);

  // five messages labelled as non-spam (0.0) go to source2
  String[] nonSpamTexts = {
    "how was your day",
    "what are you up to",
    "this is a genuine message",
    "this is an even more genuine message",
    "could you send me the report"
  };
  List<StructuredRecord> nonSpamRecords = new ArrayList<>();
  for (String text : nonSpamTexts) {
    nonSpamRecords.add(new SpamMessage(text, 0.0).toStructuredRecord());
  }
  inputManager = getDataset(NamespaceId.DEFAULT.dataset("messages2"));
  MockSource.writeInput(inputManager, nonSpamRecords);

  // send the texts that should be classified to the stream
  StreamManager textsToClassify = getStreamManager(NaiveBayesTrainer.TEXTS_TO_CLASSIFY);
  String[] unclassifiedTexts = {
    "how are you doing today",
    "free money money",
    "what are you doing today",
    "genuine report"
  };
  for (String text : unclassifiedTexts) {
    textsToClassify.send(text);
  }

  // manually trigger the pipeline and wait for the run to complete
  WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
  workflowManager.start();
  workflowManager.waitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);

  // only 'free money money' should be predicted as spam
  DataSetManager<KeyValueTable> classifiedTexts = getDataset(NaiveBayesTrainer.CLASSIFIED_TEXTS);
  KeyValueTable predictions = classifiedTexts.get();
  Assert.assertEquals(0.0d, Bytes.toDouble(predictions.read("how are you doing today")), 0.01d);
  Assert.assertEquals(1.0d, Bytes.toDouble(predictions.read("free money money")), 0.01d);
  Assert.assertEquals(0.0d, Bytes.toDouble(predictions.read("what are you doing today")), 0.01d);
  Assert.assertEquals(0.0d, Bytes.toDouble(predictions.read("genuine report")), 0.01d);

  // each source emitted its five records and the sink received all ten
  validateMetric(5, appId, "source1.records.out");
  validateMetric(5, appId, "source2.records.out");
  validateMetric(10, appId, "customsink.records.in");
}
use of co.cask.cdap.api.dataset.lib.KeyValueTable in project cdap by caskdata.
the class MultiConsumerTest method testMulti.
/**
 * Deploys {@link MultiApp}, starts every program it declares and polls the "accumulated"
 * key-value table until the value stored under {@link MultiApp#KEY} equals the expected total.
 *
 * <p>All programs that were started are stopped in a {@code finally} block so that a failed
 * verification does not leave them running.
 */
@Test
public void testMulti() throws Exception {
  // TODO: Fix this test case to really test with numGroups settings.
  final ApplicationWithPrograms app = AppFabricTestHelper.deployApplicationWithManager(MultiApp.class, TEMP_FOLDER_SUPPLIER);
  List<ProgramController> controllers = Lists.newArrayList();
  try {
    for (ProgramDescriptor programDescriptor : app.getPrograms()) {
      controllers.add(AppFabricTestHelper.submit(app, programDescriptor.getSpecification().getClassName(), new BasicArguments(), TEMP_FOLDER_SUPPLIER));
    }
    DatasetFramework datasetFramework = AppFabricTestHelper.getInjector().getInstance(DatasetFramework.class);
    DynamicDatasetCache datasetCache = new SingleThreadDatasetCache(new SystemDatasetInstantiator(datasetFramework, getClass().getClassLoader(), null), AppFabricTestHelper.getInjector().getInstance(TransactionSystemClient.class), NamespaceId.DEFAULT, DatasetDefinition.NO_ARGUMENTS, null, null);
    final KeyValueTable accumulated = datasetCache.getDataset("accumulated");
    TransactionExecutorFactory txExecutorFactory = AppFabricTestHelper.getInjector().getInstance(TransactionExecutorFactory.class);
    // Try to get accumulated result and verify it. Expect result appear in max of 60 seconds.
    int trial = 0;
    while (trial < 60) {
      try {
        Transactions.createTransactionExecutor(txExecutorFactory, accumulated).execute(new TransactionExecutor.Subroutine() {
          @Override
          public void apply() throws Exception {
            byte[] value = accumulated.read(MultiApp.KEY);
            // Sum(1..99) * 3
            Assert.assertEquals(((1 + 99) * 99 / 2) * 3, Longs.fromByteArray(value));
          }
        });
        break;
      } catch (TransactionFailureException e) {
        // The transactional check failed; back off for a second and try again.
        trial++;
        TimeUnit.SECONDS.sleep(1);
      }
    }
    // Reaching 60 trials means the loop never hit the break, i.e. the check never succeeded.
    Assert.assertTrue(trial < 60);
  } finally {
    // Stop whatever was started, regardless of whether the verification above passed.
    for (ProgramController controller : controllers) {
      controller.stop().get();
    }
  }
}
use of co.cask.cdap.api.dataset.lib.KeyValueTable in project cdap by caskdata.
the class PipelineTest method testWordCountSparkSink.
/**
 * Runs a source -> Spark sink pipeline that uses the {@link WordCountSink} plugin on the "text"
 * field and verifies the per-word counts read back from the output key-value table.
 */
@SuppressWarnings("ConstantConditions")
@Test
public void testWordCountSparkSink() throws Exception {
  String inputName = "sparkSinkInput";
  String outputName = "sparkSinkOutput";

  // stages: a mock source feeding the word count sink
  ETLStage source = new ETLStage("source", MockSource.getPlugin(inputName));
  Map<String, String> sinkProperties = new HashMap<>();
  sinkProperties.put("field", "text");
  sinkProperties.put("tableName", outputName);
  ETLPlugin wordCountPlugin = new ETLPlugin(WordCountSink.NAME, SparkSink.PLUGIN_TYPE, sinkProperties, null);
  ETLStage sink = new ETLStage("sink", wordCountPlugin);

  // pipeline config wiring the source into the sink
  ETLBatchConfig pipelineConfig = ETLBatchConfig.builder("* * * * *")
    .addStage(source)
    .addStage(sink)
    .addConnection(source.getName(), sink.getName())
    .build();

  // deploy the pipeline
  ApplicationId pipelineId = NamespaceId.DEFAULT.app("sparkSinkTestPipeline");
  AppRequest<ETLBatchConfig> pipelineRequest = new AppRequest<>(APP_ARTIFACT, pipelineConfig);
  ApplicationManager appManager = deployApplication(pipelineId, pipelineRequest);

  // write three sentences as input, one record each
  Schema inputSchema = Schema.recordOf("text", Schema.Field.of("text", Schema.of(Schema.Type.STRING)));
  DataSetManager<Table> inputManager = getDataset(inputName);
  List<StructuredRecord> inputRecords = new ArrayList<>();
  String[] sentences = {"Hello World", "Hello my name is Hal", "Hello my name is Sam"};
  for (String sentence : sentences) {
    inputRecords.add(StructuredRecord.builder(inputSchema).set("text", sentence).build());
  }
  MockSource.writeInput(inputManager, inputRecords);

  // run the workflow and wait for it to finish
  WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
  workflowManager.start();
  workflowManager.waitForFinish(4, TimeUnit.MINUTES);

  // every distinct word should map to the number of times it occurred across the sentences
  DataSetManager<KeyValueTable> outputManager = getDataset(outputName);
  KeyValueTable wordCounts = outputManager.get();
  Assert.assertEquals(3L, Bytes.toLong(wordCounts.read("Hello")));
  Assert.assertEquals(1L, Bytes.toLong(wordCounts.read("World")));
  Assert.assertEquals(2L, Bytes.toLong(wordCounts.read("my")));
  Assert.assertEquals(2L, Bytes.toLong(wordCounts.read("name")));
  Assert.assertEquals(2L, Bytes.toLong(wordCounts.read("is")));
  Assert.assertEquals(1L, Bytes.toLong(wordCounts.read("Hal")));
  Assert.assertEquals(1L, Bytes.toLong(wordCounts.read("Sam")));
}
Aggregations