use of io.cdap.cdap.api.dataset.lib.KeyValueTable in project cdap by cdapio.
the class SparkTest method testSparkWithGetDataset.
private void testSparkWithGetDataset(Class<? extends Application> appClass, String sparkProgram) throws Exception {
ApplicationManager applicationManager = deploy(appClass);
DataSetManager<FileSet> filesetManager = getDataset("logs");
FileSet fileset = filesetManager.get();
Location location = fileset.getLocation("nn");
prepareInputFileSetWithLogData(location);
Map<String, String> inputArgs = new HashMap<>();
FileSetArguments.setInputPath(inputArgs, "nn");
Map<String, String> args = new HashMap<>();
args.putAll(RuntimeArguments.addScope(Scope.DATASET, "logs", inputArgs));
args.put("input", "logs");
args.put("output", "logStats");
SparkManager sparkManager = applicationManager.getSparkManager(sparkProgram);
sparkManager.startAndWaitForGoodRun(args, ProgramRunStatus.COMPLETED, 2, TimeUnit.MINUTES);
DataSetManager<KeyValueTable> logStatsManager = getDataset("logStats");
KeyValueTable logStatsTable = logStatsManager.get();
validateGetDatasetOutput(logStatsTable);
// Cleanup after run
location.delete(true);
logStatsManager.flush();
try (CloseableIterator<KeyValue<byte[], byte[]>> scan = logStatsTable.scan(null, null)) {
while (scan.hasNext()) {
logStatsTable.delete(scan.next().getKey());
}
}
logStatsManager.flush();
}
use of io.cdap.cdap.api.dataset.lib.KeyValueTable in project cdap by cdapio.
the class SparkTest method testClassicSpark.
@Test
public void testClassicSpark() throws Exception {
ApplicationManager appManager = deploy(TestSparkApp.class);
for (Class<?> sparkClass : Arrays.asList(TestSparkApp.ClassicSpark.class, TestSparkApp.ScalaClassicSpark.class)) {
final SparkManager sparkManager = appManager.getSparkManager(sparkClass.getSimpleName());
sparkManager.startAndWaitForGoodRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
}
KeyValueTable resultTable = this.<KeyValueTable>getDataset("ResultTable").get();
Assert.assertEquals(1L, Bytes.toLong(resultTable.read(ClassicSparkProgram.class.getName())));
Assert.assertEquals(1L, Bytes.toLong(resultTable.read(ScalaClassicSparkProgram.class.getName())));
}
use of io.cdap.cdap.api.dataset.lib.KeyValueTable in project cdap by cdapio.
the class SparkTest method testTransaction.
@Test
public void testTransaction() throws Exception {
ApplicationManager applicationManager = deploy(TestSparkApp.class);
// Write some data to a local file
File inputFile = TEMP_FOLDER.newFile();
try (PrintWriter writer = new PrintWriter(Files.newBufferedWriter(inputFile.toPath(), StandardCharsets.UTF_8))) {
writer.println("red fox");
writer.println("brown fox");
writer.println("grey fox");
writer.println("brown bear");
writer.println("black bear");
}
// Run the spark program
SparkManager sparkManager = applicationManager.getSparkManager(TransactionSpark.class.getSimpleName());
sparkManager.start(ImmutableMap.of("input.file", inputFile.getAbsolutePath(), "keyvalue.table", "KeyValueTable", "result.all.dataset", "SparkResult", "result.threshold", "2", "result.threshold.dataset", "SparkThresholdResult"));
// Verify result from dataset before the Spark program terminates
final DataSetManager<KeyValueTable> resultManager = getDataset("SparkThresholdResult");
final KeyValueTable resultTable = resultManager.get();
// Expect the threshold result dataset, with threshold >=2, contains [brown, fox, bear]
Tasks.waitFor(ImmutableSet.of("brown", "fox", "bear"), () -> {
// This is to start a new TX
resultManager.flush();
LOG.info("Reading from threshold result");
try (CloseableIterator<KeyValue<byte[], byte[]>> itor = resultTable.scan(null, null)) {
return ImmutableSet.copyOf(Iterators.transform(itor, input -> {
String word = Bytes.toString(input.getKey());
LOG.info("{}, {}", word, Bytes.toInt(input.getValue()));
return word;
}));
}
}, 3, TimeUnit.MINUTES, 1, TimeUnit.SECONDS);
sparkManager.stop();
sparkManager.waitForRun(ProgramRunStatus.KILLED, 60, TimeUnit.SECONDS);
}
use of io.cdap.cdap.api.dataset.lib.KeyValueTable in project cdap by cdapio.
the class TestFrameworkTestRun method executeWorkflow.
private String executeWorkflow(ApplicationManager applicationManager, Map<String, String> additionalParams, int expectedComplete) throws Exception {
WorkflowManager wfManager = applicationManager.getWorkflowManager(WorkflowAppWithLocalDatasets.WORKFLOW_NAME);
Map<String, String> runtimeArgs = new HashMap<>();
File waitFile = new File(TMP_FOLDER.newFolder(), "/wait.file");
File doneFile = new File(TMP_FOLDER.newFolder(), "/done.file");
runtimeArgs.put("input.path", "input");
runtimeArgs.put("output.path", "output");
runtimeArgs.put("wait.file", waitFile.getAbsolutePath());
runtimeArgs.put("done.file", doneFile.getAbsolutePath());
runtimeArgs.putAll(additionalParams);
wfManager.start(runtimeArgs);
// Wait until custom action in the Workflow is triggered.
while (!waitFile.exists()) {
TimeUnit.MILLISECONDS.sleep(50);
}
// Now the Workflow should have RUNNING status. Get its runid.
List<RunRecord> history = wfManager.getHistory(ProgramRunStatus.RUNNING);
Assert.assertEquals(1, history.size());
String runId = history.get(0).getPid();
// Get the local datasets for this Workflow run
DataSetManager<KeyValueTable> localDataset = getDataset(testSpace.dataset(WorkflowAppWithLocalDatasets.WORDCOUNT_DATASET + "." + runId));
Assert.assertEquals("2", Bytes.toString(localDataset.get().read("text")));
DataSetManager<FileSet> fileSetDataset = getDataset(testSpace.dataset(WorkflowAppWithLocalDatasets.CSV_FILESET_DATASET + "." + runId));
Assert.assertNotNull(fileSetDataset.get());
// Local datasets should not exist at the namespace level
localDataset = getDataset(testSpace.dataset(WorkflowAppWithLocalDatasets.WORDCOUNT_DATASET));
Assert.assertNull(localDataset.get());
fileSetDataset = getDataset(testSpace.dataset(WorkflowAppWithLocalDatasets.CSV_FILESET_DATASET));
Assert.assertNull(fileSetDataset.get());
// Verify that the workflow hasn't completed on its own before we signal it to
history = wfManager.getHistory(ProgramRunStatus.RUNNING);
Assert.assertEquals(1, history.size());
// Signal the Workflow to continue
doneFile.createNewFile();
// Wait for workflow to finish
wfManager.waitForRuns(ProgramRunStatus.COMPLETED, expectedComplete, 1, TimeUnit.MINUTES);
Map<String, WorkflowNodeStateDetail> nodeStateDetailMap = wfManager.getWorkflowNodeStates(runId);
Map<String, String> workflowMetricsContext = new HashMap<>();
workflowMetricsContext.put(Constants.Metrics.Tag.NAMESPACE, testSpace.getNamespace());
workflowMetricsContext.put(Constants.Metrics.Tag.APP, applicationManager.getInfo().getName());
workflowMetricsContext.put(Constants.Metrics.Tag.WORKFLOW, WorkflowAppWithLocalDatasets.WORKFLOW_NAME);
workflowMetricsContext.put(Constants.Metrics.Tag.RUN_ID, runId);
Map<String, String> writerContext = new HashMap<>(workflowMetricsContext);
writerContext.put(Constants.Metrics.Tag.NODE, WorkflowAppWithLocalDatasets.LocalDatasetWriter.class.getSimpleName());
Assert.assertEquals(2, getMetricsManager().getTotalMetric(writerContext, "user.num.lines"));
Map<String, String> wfSparkMetricsContext = new HashMap<>(workflowMetricsContext);
wfSparkMetricsContext.put(Constants.Metrics.Tag.NODE, "JavaSparkCSVToSpaceConverter");
Assert.assertEquals(2, getMetricsManager().getTotalMetric(wfSparkMetricsContext, "user.num.lines"));
// check in spark context
Map<String, String> sparkMetricsContext = new HashMap<>();
sparkMetricsContext.put(Constants.Metrics.Tag.NAMESPACE, testSpace.getNamespace());
sparkMetricsContext.put(Constants.Metrics.Tag.APP, applicationManager.getInfo().getName());
sparkMetricsContext.put(Constants.Metrics.Tag.SPARK, "JavaSparkCSVToSpaceConverter");
sparkMetricsContext.put(Constants.Metrics.Tag.RUN_ID, nodeStateDetailMap.get("JavaSparkCSVToSpaceConverter").getRunId());
Assert.assertEquals(2, getMetricsManager().getTotalMetric(sparkMetricsContext, "user.num.lines"));
Map<String, String> appMetricsContext = new HashMap<>();
appMetricsContext.put(Constants.Metrics.Tag.NAMESPACE, testSpace.getNamespace());
appMetricsContext.put(Constants.Metrics.Tag.APP, applicationManager.getInfo().getName());
// app metrics context should have sum from custom action and spark metrics.
Assert.assertEquals(4, getMetricsManager().getTotalMetric(appMetricsContext, "user.num.lines"));
Map<String, String> wfMRMetricsContext = new HashMap<>(workflowMetricsContext);
wfMRMetricsContext.put(Constants.Metrics.Tag.NODE, "WordCount");
Assert.assertEquals(7, getMetricsManager().getTotalMetric(wfMRMetricsContext, "user.num.words"));
// mr metrics context
Map<String, String> mrMetricsContext = new HashMap<>();
mrMetricsContext.put(Constants.Metrics.Tag.NAMESPACE, testSpace.getNamespace());
mrMetricsContext.put(Constants.Metrics.Tag.APP, applicationManager.getInfo().getName());
mrMetricsContext.put(Constants.Metrics.Tag.MAPREDUCE, "WordCount");
mrMetricsContext.put(Constants.Metrics.Tag.RUN_ID, nodeStateDetailMap.get("WordCount").getRunId());
Assert.assertEquals(7, getMetricsManager().getTotalMetric(mrMetricsContext, "user.num.words"));
final Map<String, String> readerContext = new HashMap<>(workflowMetricsContext);
readerContext.put(Constants.Metrics.Tag.NODE, "readerAction");
Tasks.waitFor(6L, new Callable<Long>() {
@Override
public Long call() throws Exception {
return getMetricsManager().getTotalMetric(readerContext, "user.unique.words");
}
}, 60, TimeUnit.SECONDS);
return runId;
}
use of io.cdap.cdap.api.dataset.lib.KeyValueTable in project cdap by cdapio.
the class SparkLogParser method run.
@Override
public void run(JavaSparkExecutionContext sec) throws Exception {
JavaSparkContext jsc = new JavaSparkContext();
Map<String, String> runtimeArguments = sec.getRuntimeArguments();
String inputFileSet = runtimeArguments.get("input");
final String outputTable = runtimeArguments.get("output");
JavaPairRDD<LongWritable, Text> input = sec.fromDataset(inputFileSet);
final JavaPairRDD<String, String> aggregated = input.mapToPair(new PairFunction<Tuple2<LongWritable, Text>, LogKey, LogStats>() {
@Override
public Tuple2<LogKey, LogStats> call(Tuple2<LongWritable, Text> input) throws Exception {
return SparkAppUsingGetDataset.parse(input._2());
}
}).reduceByKey(new Function2<LogStats, LogStats, LogStats>() {
@Override
public LogStats call(LogStats stats1, LogStats stats2) throws Exception {
return stats1.aggregate(stats2);
}
}).mapPartitionsToPair(new PairFlatMapFunction<Iterator<Tuple2<LogKey, LogStats>>, String, String>() {
@Override
public Iterator<Tuple2<String, String>> call(Iterator<Tuple2<LogKey, LogStats>> itor) throws Exception {
final Gson gson = new Gson();
return Lists.newArrayList(Iterators.transform(itor, new Function<Tuple2<LogKey, LogStats>, Tuple2<String, String>>() {
@Override
public Tuple2<String, String> apply(Tuple2<LogKey, LogStats> input) {
return new Tuple2<>(gson.toJson(input._1()), gson.toJson(input._2()));
}
})).iterator();
}
});
// Collect all data to driver and write to dataset directly. That's the intend of the test.
sec.execute(new TxRunnable() {
@Override
public void run(DatasetContext context) throws Exception {
KeyValueTable kvTable = context.getDataset(outputTable);
for (Map.Entry<String, String> entry : aggregated.collectAsMap().entrySet()) {
kvTable.write(entry.getKey(), entry.getValue());
}
}
});
}
Aggregations