Use of co.cask.cdap.api.dataset.lib.FileSet in project cdap by caskdata.
Class FileSetTest, method testAbsolutePath.
/**
 * Verifies that a file set configured with an absolute base path creates that
 * directory when the instance is created, resolves its output location under
 * that base path, and materializes written files on the local file system.
 */
@Test
public void testAbsolutePath() throws IOException, DatasetManagementException {
  String basePath = tmpFolder.newFolder() + "/absolute/path";
  FileSetProperties properties = FileSetProperties.builder().setBasePath(basePath).build();
  dsFrameworkUtil.createInstance("fileSet", testFileSetInstance3, properties);
  // The base directory must exist as soon as the instance is created.
  Assert.assertTrue(new File(basePath).isDirectory());
  // Instantiate the file set with an explicit output path argument.
  Map<String, String> arguments = Maps.newHashMap();
  FileSetArguments.setOutputPath(arguments, "out");
  FileSet fileSet = dsFrameworkUtil.getInstance(testFileSetInstance3, arguments);
  // The output location must resolve directly under the absolute base path.
  Assert.assertEquals(basePath + "/out", fileSet.getOutputLocation().toURI().getPath());
  // Write a single byte through the dataset's output location.
  try (OutputStream stream = fileSet.getOutputLocation().getOutputStream()) {
    stream.write(42);
  }
  // The write must have produced a real file at the expected absolute path.
  Assert.assertTrue(new File(basePath + "/out").isFile());
}
Use of co.cask.cdap.api.dataset.lib.FileSet in project cdap by caskdata.
Class TestFrameworkTestRun, method executeWorkflow.
/**
 * Starts the workflow from {@code WorkflowAppWithLocalDatasets}, verifies that its
 * run-scoped local datasets exist only while the run is in progress, signals the
 * workflow's custom action to finish, and then checks the metrics emitted by the
 * writer action, the Spark node, the MapReduce node, and the reader action.
 *
 * @param applicationManager manager of the deployed workflow application
 * @param additionalParams extra runtime arguments merged over the defaults
 * @param expectedComplete number of COMPLETED runs to wait for after signalling
 * @return the run id of the RUNNING workflow run observed by this invocation
 * @throws Exception if any wait times out or an assertion fails
 */
private String executeWorkflow(ApplicationManager applicationManager, Map<String, String> additionalParams, int expectedComplete) throws Exception {
WorkflowManager wfManager = applicationManager.getWorkflowManager(WorkflowAppWithLocalDatasets.WORKFLOW_NAME);
Map<String, String> runtimeArgs = new HashMap<>();
// Marker files used to coordinate with the custom action inside the workflow:
// the action creates wait.file when it starts and blocks until done.file exists.
File waitFile = new File(TMP_FOLDER.newFolder(), "/wait.file");
File doneFile = new File(TMP_FOLDER.newFolder(), "/done.file");
runtimeArgs.put("input.path", "input");
runtimeArgs.put("output.path", "output");
runtimeArgs.put("wait.file", waitFile.getAbsolutePath());
runtimeArgs.put("done.file", doneFile.getAbsolutePath());
// Caller-supplied arguments override the defaults above.
runtimeArgs.putAll(additionalParams);
wfManager.start(runtimeArgs);
// Wait until custom action in the Workflow is triggered.
while (!waitFile.exists()) {
TimeUnit.MILLISECONDS.sleep(50);
}
// Now the Workflow should have RUNNING status. Get its runid.
List<RunRecord> history = wfManager.getHistory(ProgramRunStatus.RUNNING);
Assert.assertEquals(1, history.size());
String runId = history.get(0).getPid();
// Get the local datasets for this Workflow run
// (local dataset instances are suffixed with "." + runId while the run is live).
DataSetManager<KeyValueTable> localDataset = getDataset(testSpace.dataset(WorkflowAppWithLocalDatasets.WORDCOUNT_DATASET + "." + runId));
Assert.assertEquals("2", Bytes.toString(localDataset.get().read("text")));
DataSetManager<FileSet> fileSetDataset = getDataset(testSpace.dataset(WorkflowAppWithLocalDatasets.CSV_FILESET_DATASET + "." + runId));
Assert.assertNotNull(fileSetDataset.get());
// Local datasets should not exist at the namespace level
localDataset = getDataset(testSpace.dataset(WorkflowAppWithLocalDatasets.WORDCOUNT_DATASET));
Assert.assertNull(localDataset.get());
fileSetDataset = getDataset(testSpace.dataset(WorkflowAppWithLocalDatasets.CSV_FILESET_DATASET));
Assert.assertNull(fileSetDataset.get());
// Verify that the workflow hasn't completed on its own before we signal it to
history = wfManager.getHistory(ProgramRunStatus.RUNNING);
Assert.assertEquals(1, history.size());
// Signal the Workflow to continue
doneFile.createNewFile();
// Wait for workflow to finish
wfManager.waitForRuns(ProgramRunStatus.COMPLETED, expectedComplete, 1, TimeUnit.MINUTES);
// Per-node run ids are needed below to query program-level (non-workflow) metrics.
Map<String, WorkflowNodeStateDetail> nodeStateDetailMap = wfManager.getWorkflowNodeStates(runId);
// Base metrics context identifying this specific workflow run.
Map<String, String> workflowMetricsContext = new HashMap<>();
workflowMetricsContext.put(Constants.Metrics.Tag.NAMESPACE, testSpace.getNamespace());
workflowMetricsContext.put(Constants.Metrics.Tag.APP, applicationManager.getInfo().getName());
workflowMetricsContext.put(Constants.Metrics.Tag.WORKFLOW, WorkflowAppWithLocalDatasets.WORKFLOW_NAME);
workflowMetricsContext.put(Constants.Metrics.Tag.RUN_ID, runId);
// Metrics emitted by the local-dataset writer custom action.
Map<String, String> writerContext = new HashMap<>(workflowMetricsContext);
writerContext.put(Constants.Metrics.Tag.NODE, WorkflowAppWithLocalDatasets.LocalDatasetWriter.class.getSimpleName());
Assert.assertEquals(2, getMetricsManager().getTotalMetric(writerContext, "user.num.lines"));
// Same metric, queried through the workflow-node context of the Spark program.
Map<String, String> wfSparkMetricsContext = new HashMap<>(workflowMetricsContext);
wfSparkMetricsContext.put(Constants.Metrics.Tag.NODE, "JavaSparkCSVToSpaceConverter");
Assert.assertEquals(2, getMetricsManager().getTotalMetric(wfSparkMetricsContext, "user.num.lines"));
// check in spark context
Map<String, String> sparkMetricsContext = new HashMap<>();
sparkMetricsContext.put(Constants.Metrics.Tag.NAMESPACE, testSpace.getNamespace());
sparkMetricsContext.put(Constants.Metrics.Tag.APP, applicationManager.getInfo().getName());
sparkMetricsContext.put(Constants.Metrics.Tag.SPARK, "JavaSparkCSVToSpaceConverter");
sparkMetricsContext.put(Constants.Metrics.Tag.RUN_ID, nodeStateDetailMap.get("JavaSparkCSVToSpaceConverter").getRunId());
Assert.assertEquals(2, getMetricsManager().getTotalMetric(sparkMetricsContext, "user.num.lines"));
Map<String, String> appMetricsContext = new HashMap<>();
appMetricsContext.put(Constants.Metrics.Tag.NAMESPACE, testSpace.getNamespace());
appMetricsContext.put(Constants.Metrics.Tag.APP, applicationManager.getInfo().getName());
// app metrics context should have sum from custom action and spark metrics.
Assert.assertEquals(4, getMetricsManager().getTotalMetric(appMetricsContext, "user.num.lines"));
// Word-count metric through the workflow-node context of the MapReduce program.
Map<String, String> wfMRMetricsContext = new HashMap<>(workflowMetricsContext);
wfMRMetricsContext.put(Constants.Metrics.Tag.NODE, "WordCount");
Assert.assertEquals(7, getMetricsManager().getTotalMetric(wfMRMetricsContext, "user.num.words"));
// mr metrics context
Map<String, String> mrMetricsContext = new HashMap<>();
mrMetricsContext.put(Constants.Metrics.Tag.NAMESPACE, testSpace.getNamespace());
mrMetricsContext.put(Constants.Metrics.Tag.APP, applicationManager.getInfo().getName());
mrMetricsContext.put(Constants.Metrics.Tag.MAPREDUCE, "WordCount");
mrMetricsContext.put(Constants.Metrics.Tag.RUN_ID, nodeStateDetailMap.get("WordCount").getRunId());
Assert.assertEquals(7, getMetricsManager().getTotalMetric(mrMetricsContext, "user.num.words"));
// The reader action's metric may arrive after completion, so poll instead of
// asserting immediately.
final Map<String, String> readerContext = new HashMap<>(workflowMetricsContext);
readerContext.put(Constants.Metrics.Tag.NODE, "readerAction");
Tasks.waitFor(6L, new Callable<Long>() {
@Override
public Long call() throws Exception {
return getMetricsManager().getTotalMetric(readerContext, "user.unique.words");
}
}, 60, TimeUnit.SECONDS);
return runId;
}
Use of co.cask.cdap.api.dataset.lib.FileSet in project cdap by caskdata.
Class TestFrameworkTestRun, method testClusterName.
/**
 * Verifies that every program type (service, worker, flow, MapReduce, Spark,
 * and workflow) sees the configured cluster name at runtime. Each program
 * writes the cluster name it observes into a shared KeyValueTable under a
 * program-specific key, and this test polls that table for each key in turn.
 */
@Test
public void testClusterName() throws Exception {
String clusterName = getConfiguration().get(Constants.CLUSTER_NAME);
ApplicationManager appManager = deployApplication(ClusterNameTestApp.class);
final DataSetManager<KeyValueTable> datasetManager = getDataset(ClusterNameTestApp.CLUSTER_NAME_TABLE);
final KeyValueTable clusterNameTable = datasetManager.get();
// A callable for reading the cluster name from the ClusterNameTable.
// It is used for Tasks.waitFor call down below.
// The key to read is swapped via this AtomicReference before each waitFor call.
final AtomicReference<String> key = new AtomicReference<>();
Callable<String> readClusterName = new Callable<String>() {
@Nullable
@Override
public String call() throws Exception {
// flush() picks up writes made by the program since the last read.
datasetManager.flush();
byte[] bytes = clusterNameTable.read(key.get());
return bytes == null ? null : new String(bytes, StandardCharsets.UTF_8);
}
};
// Service
ServiceManager serviceManager = appManager.getServiceManager(ClusterNameTestApp.ClusterNameServiceHandler.class.getSimpleName()).start();
Assert.assertEquals(clusterName, callServiceGet(serviceManager.getServiceURL(10, TimeUnit.SECONDS), "clusterName"));
serviceManager.stop();
// Worker
WorkerManager workerManager = appManager.getWorkerManager(ClusterNameTestApp.ClusterNameWorker.class.getSimpleName()).start();
key.set("worker.cluster.name");
Tasks.waitFor(clusterName, readClusterName, 10, TimeUnit.SECONDS, 100, TimeUnit.MILLISECONDS);
// The worker will stop by itself. No need to call stop
workerManager.waitForRun(ProgramRunStatus.COMPLETED, 10, TimeUnit.SECONDS);
// Flow
FlowManager flowManager = appManager.getFlowManager(ClusterNameTestApp.ClusterNameFlow.class.getSimpleName()).start();
key.set("flow.cluster.name");
Tasks.waitFor(clusterName, readClusterName, 10, TimeUnit.SECONDS, 100, TimeUnit.MILLISECONDS);
flowManager.stop();
// MapReduce
// Setup the input file used by MR
Location location = this.<FileSet>getDataset(ClusterNameTestApp.INPUT_FILE_SET).get().getLocation("input");
try (PrintStream printer = new PrintStream(location.getOutputStream(), true, "UTF-8")) {
for (int i = 0; i < 10; i++) {
printer.println("Hello World " + i);
}
}
// Setup input and output dataset arguments
Map<String, String> inputArgs = new HashMap<>();
FileSetArguments.setInputPath(inputArgs, "input");
Map<String, String> outputArgs = new HashMap<>();
FileSetArguments.setOutputPath(outputArgs, "output");
Map<String, String> args = new HashMap<>();
args.putAll(RuntimeArguments.addScope(Scope.DATASET, ClusterNameTestApp.INPUT_FILE_SET, inputArgs));
args.putAll(RuntimeArguments.addScope(Scope.DATASET, ClusterNameTestApp.OUTPUT_FILE_SET, outputArgs));
MapReduceManager mrManager = appManager.getMapReduceManager(ClusterNameTestApp.ClusterNameMapReduce.class.getSimpleName()).start(args);
// MR writes three keys: one from the client, one from the mapper, one from the reducer.
key.set("mr.client.cluster.name");
Tasks.waitFor(clusterName, readClusterName, 10, TimeUnit.SECONDS, 100, TimeUnit.MILLISECONDS);
key.set("mapper.cluster.name");
Tasks.waitFor(clusterName, readClusterName, 10, TimeUnit.SECONDS, 100, TimeUnit.MILLISECONDS);
key.set("reducer.cluster.name");
Tasks.waitFor(clusterName, readClusterName, 10, TimeUnit.SECONDS, 100, TimeUnit.MILLISECONDS);
mrManager.waitForRun(ProgramRunStatus.COMPLETED, 60, TimeUnit.SECONDS);
// Spark
SparkManager sparkManager = appManager.getSparkManager(ClusterNameTestApp.ClusterNameSpark.class.getSimpleName()).start();
key.set("spark.cluster.name");
Tasks.waitFor(clusterName, readClusterName, 10, TimeUnit.SECONDS, 100, TimeUnit.MILLISECONDS);
sparkManager.waitForRun(ProgramRunStatus.COMPLETED, 60, TimeUnit.SECONDS);
// Workflow
// Cleanup the output path for the MR job in the workflow first
this.<FileSet>getDataset(ClusterNameTestApp.OUTPUT_FILE_SET).get().getLocation("output").delete(true);
// Scope the MR arguments so they only apply to the MR node inside the workflow.
args = RuntimeArguments.addScope(Scope.MAPREDUCE, ClusterNameTestApp.ClusterNameMapReduce.class.getSimpleName(), args);
WorkflowManager workflowManager = appManager.getWorkflowManager(ClusterNameTestApp.ClusterNameWorkflow.class.getSimpleName()).start(args);
// Inside the workflow each program prefixes its key with the workflow name.
String prefix = ClusterNameTestApp.ClusterNameWorkflow.class.getSimpleName() + ".";
key.set(prefix + "mr.client.cluster.name");
Tasks.waitFor(clusterName, readClusterName, 10, TimeUnit.SECONDS, 100, TimeUnit.MILLISECONDS);
key.set(prefix + "mapper.cluster.name");
Tasks.waitFor(clusterName, readClusterName, 10, TimeUnit.SECONDS, 100, TimeUnit.MILLISECONDS);
key.set(prefix + "reducer.cluster.name");
Tasks.waitFor(clusterName, readClusterName, 10, TimeUnit.SECONDS, 100, TimeUnit.MILLISECONDS);
key.set(prefix + "spark.cluster.name");
Tasks.waitFor(clusterName, readClusterName, 10, TimeUnit.SECONDS, 100, TimeUnit.MILLISECONDS);
key.set(prefix + "action.cluster.name");
Tasks.waitFor(clusterName, readClusterName, 10, TimeUnit.SECONDS, 100, TimeUnit.MILLISECONDS);
workflowManager.waitForRun(ProgramRunStatus.COMPLETED, 120, TimeUnit.SECONDS);
}
Use of co.cask.cdap.api.dataset.lib.FileSet in project cdap by caskdata.
Class SparkFileSetTestRun, method testSparkWithFileSet.
/**
 * Runs the given Spark program against the "fs" FileSet: prepares input under
 * path "nn", starts the program with dataset-scoped input/output arguments,
 * validates the output written under path "xx", then cleans up both paths.
 *
 * @param applicationManager manager of the deployed application
 * @param sparkProgram name of the Spark program to run
 * @throws Exception if the run fails or does not complete within one minute
 */
private void testSparkWithFileSet(ApplicationManager applicationManager, String sparkProgram) throws Exception {
  DataSetManager<FileSet> filesetManager = getDataset("fs");
  final FileSet fileset = filesetManager.get();
  Location location = fileset.getLocation("nn");
  prepareFileInput(location);
  Map<String, String> inputArgs = new HashMap<>();
  FileSetArguments.setInputPath(inputArgs, "nn");
  Map<String, String> outputArgs = new HashMap<>();
  // Fix: the output path was previously set on inputArgs, leaving outputArgs
  // empty (dead). It only worked because both maps are scoped to the same
  // dataset "fs" below; setting it on outputArgs matches the clear intent.
  FileSetArguments.setOutputPath(outputArgs, "xx");
  Map<String, String> args = new HashMap<>();
  args.putAll(RuntimeArguments.addScope(Scope.DATASET, "fs", inputArgs));
  args.putAll(RuntimeArguments.addScope(Scope.DATASET, "fs", outputArgs));
  args.put("input", "fs");
  args.put("output", "fs");
  SparkManager sparkManager = applicationManager.getSparkManager(sparkProgram).start(args);
  sparkManager.waitForRun(ProgramRunStatus.COMPLETED, 1, TimeUnit.MINUTES);
  validateFileOutput(fileset.getLocation("xx"), "custom:");
  // Cleanup paths after running test
  fileset.getLocation("nn").delete(true);
  fileset.getLocation("xx").delete(true);
}
Use of co.cask.cdap.api.dataset.lib.FileSet in project cdap by caskdata.
Class SparkTestRun, method testSparkWithGetDataset.
/**
 * Deploys the given application and runs the named Spark program, which reads
 * log data from the "logs" FileSet (path "nn") and writes aggregated results
 * to the "logStats" KeyValueTable. Validates the results, then removes the
 * input files and every row written to the stats table.
 *
 * @param appClass application class to deploy
 * @param sparkProgram name of the Spark program to run
 * @throws Exception if the run fails or does not complete within two minutes
 */
private void testSparkWithGetDataset(Class<? extends Application> appClass, String sparkProgram) throws Exception {
  ApplicationManager appManager = deploy(appClass);
  // Stage the input log data at path "nn" of the "logs" FileSet.
  DataSetManager<FileSet> logsManager = getDataset("logs");
  FileSet logsFileSet = logsManager.get();
  Location inputLocation = logsFileSet.getLocation("nn");
  prepareInputFileSetWithLogData(inputLocation);
  // Build the runtime arguments: a dataset-scoped input path plus the
  // program-level dataset names.
  Map<String, String> datasetArgs = new HashMap<>();
  FileSetArguments.setInputPath(datasetArgs, "nn");
  Map<String, String> runtimeArgs = new HashMap<>(RuntimeArguments.addScope(Scope.DATASET, "logs", datasetArgs));
  runtimeArgs.put("input", "logs");
  runtimeArgs.put("output", "logStats");
  SparkManager sparkManager = appManager.getSparkManager(sparkProgram).start(runtimeArgs);
  sparkManager.waitForRun(ProgramRunStatus.COMPLETED, 2, TimeUnit.MINUTES);
  // Validate the aggregated output written by the program.
  DataSetManager<KeyValueTable> statsManager = getDataset("logStats");
  KeyValueTable statsTable = statsManager.get();
  validateGetDatasetOutput(statsTable);
  // Cleanup after run: drop the input files and clear the stats table.
  inputLocation.delete(true);
  statsManager.flush();
  try (CloseableIterator<KeyValue<byte[], byte[]>> rows = statsTable.scan(null, null)) {
    while (rows.hasNext()) {
      statsTable.delete(rows.next().getKey());
    }
  }
  statsManager.flush();
}
Aggregations