Use of co.cask.cdap.api.dataset.lib.FileSet in project cdap by caskdata.
Class FileSetTest, method testAbsolutePath.
/**
 * Verifies that a file set configured with an absolute base path creates that
 * directory when the instance is created, resolves its output location under
 * that base path, and materializes written files on the local file system.
 */
@Test
public void testAbsolutePath() throws IOException, DatasetManagementException {
  String basePath = tmpFolder.newFolder() + "/absolute/path";
  FileSetProperties properties = FileSetProperties.builder().setBasePath(basePath).build();
  dsFrameworkUtil.createInstance("fileSet", testFileSetInstance3, properties);
  // The base directory must exist as soon as the instance is created.
  Assert.assertTrue(new File(basePath).isDirectory());
  // Instantiate the file set with an explicit output path argument.
  Map<String, String> arguments = Maps.newHashMap();
  FileSetArguments.setOutputPath(arguments, "out");
  FileSet fileSet = dsFrameworkUtil.getInstance(testFileSetInstance3, arguments);
  // The output location must resolve directly under the absolute base path.
  Assert.assertEquals(basePath + "/out", fileSet.getOutputLocation().toURI().getPath());
  // Write a single byte through the dataset's output location.
  try (OutputStream stream = fileSet.getOutputLocation().getOutputStream()) {
    stream.write(42);
  }
  // The write must have produced a real file at the expected absolute path.
  Assert.assertTrue(new File(basePath + "/out").isFile());
}
Use of co.cask.cdap.api.dataset.lib.FileSet in project cdap by caskdata.
Class TestFrameworkTestRun, method executeWorkflow.
/**
 * Starts the workflow from {@code WorkflowAppWithLocalDatasets}, verifies that its
 * run-scoped local datasets exist only while the run is in progress, signals the
 * workflow's custom action to finish, and then checks the metrics emitted by the
 * writer action, the Spark node, the MapReduce node, and the reader action.
 *
 * @param applicationManager manager of the deployed workflow application
 * @param additionalParams extra runtime arguments merged over the defaults
 * @param expectedComplete number of COMPLETED runs to wait for after signalling
 * @return the run id of the RUNNING workflow run observed by this invocation
 * @throws Exception if any wait times out or an assertion fails
 */
private String executeWorkflow(ApplicationManager applicationManager, Map<String, String> additionalParams, int expectedComplete) throws Exception {
WorkflowManager wfManager = applicationManager.getWorkflowManager(WorkflowAppWithLocalDatasets.WORKFLOW_NAME);
Map<String, String> runtimeArgs = new HashMap<>();
// Marker files used to coordinate with the custom action inside the workflow:
// the action creates wait.file when it starts and blocks until done.file exists.
File waitFile = new File(TMP_FOLDER.newFolder(), "/wait.file");
File doneFile = new File(TMP_FOLDER.newFolder(), "/done.file");
runtimeArgs.put("input.path", "input");
runtimeArgs.put("output.path", "output");
runtimeArgs.put("wait.file", waitFile.getAbsolutePath());
runtimeArgs.put("done.file", doneFile.getAbsolutePath());
// Caller-supplied arguments override the defaults above.
runtimeArgs.putAll(additionalParams);
wfManager.start(runtimeArgs);
// Wait until custom action in the Workflow is triggered.
while (!waitFile.exists()) {
TimeUnit.MILLISECONDS.sleep(50);
}
// Now the Workflow should have RUNNING status. Get its runid.
List<RunRecord> history = wfManager.getHistory(ProgramRunStatus.RUNNING);
Assert.assertEquals(1, history.size());
String runId = history.get(0).getPid();
// Get the local datasets for this Workflow run
// (local dataset instances are suffixed with "." + runId while the run is live).
DataSetManager<KeyValueTable> localDataset = getDataset(testSpace.dataset(WorkflowAppWithLocalDatasets.WORDCOUNT_DATASET + "." + runId));
Assert.assertEquals("2", Bytes.toString(localDataset.get().read("text")));
DataSetManager<FileSet> fileSetDataset = getDataset(testSpace.dataset(WorkflowAppWithLocalDatasets.CSV_FILESET_DATASET + "." + runId));
Assert.assertNotNull(fileSetDataset.get());
// Local datasets should not exist at the namespace level
localDataset = getDataset(testSpace.dataset(WorkflowAppWithLocalDatasets.WORDCOUNT_DATASET));
Assert.assertNull(localDataset.get());
fileSetDataset = getDataset(testSpace.dataset(WorkflowAppWithLocalDatasets.CSV_FILESET_DATASET));
Assert.assertNull(fileSetDataset.get());
// Verify that the workflow hasn't completed on its own before we signal it to
history = wfManager.getHistory(ProgramRunStatus.RUNNING);
Assert.assertEquals(1, history.size());
// Signal the Workflow to continue
doneFile.createNewFile();
// Wait for workflow to finish
wfManager.waitForRuns(ProgramRunStatus.COMPLETED, expectedComplete, 1, TimeUnit.MINUTES);
// Per-node run ids are needed below to query program-level (non-workflow) metrics.
Map<String, WorkflowNodeStateDetail> nodeStateDetailMap = wfManager.getWorkflowNodeStates(runId);
// Base metrics context identifying this specific workflow run.
Map<String, String> workflowMetricsContext = new HashMap<>();
workflowMetricsContext.put(Constants.Metrics.Tag.NAMESPACE, testSpace.getNamespace());
workflowMetricsContext.put(Constants.Metrics.Tag.APP, applicationManager.getInfo().getName());
workflowMetricsContext.put(Constants.Metrics.Tag.WORKFLOW, WorkflowAppWithLocalDatasets.WORKFLOW_NAME);
workflowMetricsContext.put(Constants.Metrics.Tag.RUN_ID, runId);
// Metrics emitted by the local-dataset writer custom action.
Map<String, String> writerContext = new HashMap<>(workflowMetricsContext);
writerContext.put(Constants.Metrics.Tag.NODE, WorkflowAppWithLocalDatasets.LocalDatasetWriter.class.getSimpleName());
Assert.assertEquals(2, getMetricsManager().getTotalMetric(writerContext, "user.num.lines"));
// Same metric, queried through the workflow-node context of the Spark program.
Map<String, String> wfSparkMetricsContext = new HashMap<>(workflowMetricsContext);
wfSparkMetricsContext.put(Constants.Metrics.Tag.NODE, "JavaSparkCSVToSpaceConverter");
Assert.assertEquals(2, getMetricsManager().getTotalMetric(wfSparkMetricsContext, "user.num.lines"));
// check in spark context
Map<String, String> sparkMetricsContext = new HashMap<>();
sparkMetricsContext.put(Constants.Metrics.Tag.NAMESPACE, testSpace.getNamespace());
sparkMetricsContext.put(Constants.Metrics.Tag.APP, applicationManager.getInfo().getName());
sparkMetricsContext.put(Constants.Metrics.Tag.SPARK, "JavaSparkCSVToSpaceConverter");
sparkMetricsContext.put(Constants.Metrics.Tag.RUN_ID, nodeStateDetailMap.get("JavaSparkCSVToSpaceConverter").getRunId());
Assert.assertEquals(2, getMetricsManager().getTotalMetric(sparkMetricsContext, "user.num.lines"));
Map<String, String> appMetricsContext = new HashMap<>();
appMetricsContext.put(Constants.Metrics.Tag.NAMESPACE, testSpace.getNamespace());
appMetricsContext.put(Constants.Metrics.Tag.APP, applicationManager.getInfo().getName());
// app metrics context should have sum from custom action and spark metrics.
Assert.assertEquals(4, getMetricsManager().getTotalMetric(appMetricsContext, "user.num.lines"));
// Word-count metric through the workflow-node context of the MapReduce program.
Map<String, String> wfMRMetricsContext = new HashMap<>(workflowMetricsContext);
wfMRMetricsContext.put(Constants.Metrics.Tag.NODE, "WordCount");
Assert.assertEquals(7, getMetricsManager().getTotalMetric(wfMRMetricsContext, "user.num.words"));
// mr metrics context
Map<String, String> mrMetricsContext = new HashMap<>();
mrMetricsContext.put(Constants.Metrics.Tag.NAMESPACE, testSpace.getNamespace());
mrMetricsContext.put(Constants.Metrics.Tag.APP, applicationManager.getInfo().getName());
mrMetricsContext.put(Constants.Metrics.Tag.MAPREDUCE, "WordCount");
mrMetricsContext.put(Constants.Metrics.Tag.RUN_ID, nodeStateDetailMap.get("WordCount").getRunId());
Assert.assertEquals(7, getMetricsManager().getTotalMetric(mrMetricsContext, "user.num.words"));
// The reader action's metric may arrive after completion, so poll instead of
// asserting immediately.
final Map<String, String> readerContext = new HashMap<>(workflowMetricsContext);
readerContext.put(Constants.Metrics.Tag.NODE, "readerAction");
Tasks.waitFor(6L, new Callable<Long>() {
@Override
public Long call() throws Exception {
return getMetricsManager().getTotalMetric(readerContext, "user.unique.words");
}
}, 60, TimeUnit.SECONDS);
return runId;
}
Use of co.cask.cdap.api.dataset.lib.FileSet in project cdap by caskdata.
Class TestFrameworkTestRun, method testClusterName.
/**
 * Verifies that every program type (service, worker, flow, MapReduce, Spark,
 * and workflow) sees the configured cluster name at runtime. Each program
 * writes the cluster name it observes into a shared KeyValueTable under a
 * program-specific key, and this test polls that table for each key in turn.
 */
@Test
public void testClusterName() throws Exception {
String clusterName = getConfiguration().get(Constants.CLUSTER_NAME);
ApplicationManager appManager = deployApplication(ClusterNameTestApp.class);
final DataSetManager<KeyValueTable> datasetManager = getDataset(ClusterNameTestApp.CLUSTER_NAME_TABLE);
final KeyValueTable clusterNameTable = datasetManager.get();
// A callable for reading the cluster name from the ClusterNameTable.
// It is used for Tasks.waitFor call down below.
// The key to read is swapped via this AtomicReference before each waitFor call.
final AtomicReference<String> key = new AtomicReference<>();
Callable<String> readClusterName = new Callable<String>() {
@Nullable
@Override
public String call() throws Exception {
// flush() picks up writes made by the program since the last read.
datasetManager.flush();
byte[] bytes = clusterNameTable.read(key.get());
return bytes == null ? null : new String(bytes, StandardCharsets.UTF_8);
}
};
// Service
ServiceManager serviceManager = appManager.getServiceManager(ClusterNameTestApp.ClusterNameServiceHandler.class.getSimpleName()).start();
Assert.assertEquals(clusterName, callServiceGet(serviceManager.getServiceURL(10, TimeUnit.SECONDS), "clusterName"));
serviceManager.stop();
// Worker
WorkerManager workerManager = appManager.getWorkerManager(ClusterNameTestApp.ClusterNameWorker.class.getSimpleName()).start();
key.set("worker.cluster.name");
Tasks.waitFor(clusterName, readClusterName, 10, TimeUnit.SECONDS, 100, TimeUnit.MILLISECONDS);
// The worker will stop by itself. No need to call stop
workerManager.waitForRun(ProgramRunStatus.COMPLETED, 10, TimeUnit.SECONDS);
// Flow
FlowManager flowManager = appManager.getFlowManager(ClusterNameTestApp.ClusterNameFlow.class.getSimpleName()).start();
key.set("flow.cluster.name");
Tasks.waitFor(clusterName, readClusterName, 10, TimeUnit.SECONDS, 100, TimeUnit.MILLISECONDS);
flowManager.stop();
// MapReduce
// Setup the input file used by MR
Location location = this.<FileSet>getDataset(ClusterNameTestApp.INPUT_FILE_SET).get().getLocation("input");
try (PrintStream printer = new PrintStream(location.getOutputStream(), true, "UTF-8")) {
for (int i = 0; i < 10; i++) {
printer.println("Hello World " + i);
}
}
// Setup input and output dataset arguments
Map<String, String> inputArgs = new HashMap<>();
FileSetArguments.setInputPath(inputArgs, "input");
Map<String, String> outputArgs = new HashMap<>();
FileSetArguments.setOutputPath(outputArgs, "output");
Map<String, String> args = new HashMap<>();
args.putAll(RuntimeArguments.addScope(Scope.DATASET, ClusterNameTestApp.INPUT_FILE_SET, inputArgs));
args.putAll(RuntimeArguments.addScope(Scope.DATASET, ClusterNameTestApp.OUTPUT_FILE_SET, outputArgs));
MapReduceManager mrManager = appManager.getMapReduceManager(ClusterNameTestApp.ClusterNameMapReduce.class.getSimpleName()).start(args);
// MR writes three keys: one from the client, one from the mapper, one from the reducer.
key.set("mr.client.cluster.name");
Tasks.waitFor(clusterName, readClusterName, 10, TimeUnit.SECONDS, 100, TimeUnit.MILLISECONDS);
key.set("mapper.cluster.name");
Tasks.waitFor(clusterName, readClusterName, 10, TimeUnit.SECONDS, 100, TimeUnit.MILLISECONDS);
key.set("reducer.cluster.name");
Tasks.waitFor(clusterName, readClusterName, 10, TimeUnit.SECONDS, 100, TimeUnit.MILLISECONDS);
mrManager.waitForRun(ProgramRunStatus.COMPLETED, 60, TimeUnit.SECONDS);
// Spark
SparkManager sparkManager = appManager.getSparkManager(ClusterNameTestApp.ClusterNameSpark.class.getSimpleName()).start();
key.set("spark.cluster.name");
Tasks.waitFor(clusterName, readClusterName, 10, TimeUnit.SECONDS, 100, TimeUnit.MILLISECONDS);
sparkManager.waitForRun(ProgramRunStatus.COMPLETED, 60, TimeUnit.SECONDS);
// Workflow
// Cleanup the output path for the MR job in the workflow first
this.<FileSet>getDataset(ClusterNameTestApp.OUTPUT_FILE_SET).get().getLocation("output").delete(true);
// Scope the MR arguments so they only apply to the MR node inside the workflow.
args = RuntimeArguments.addScope(Scope.MAPREDUCE, ClusterNameTestApp.ClusterNameMapReduce.class.getSimpleName(), args);
WorkflowManager workflowManager = appManager.getWorkflowManager(ClusterNameTestApp.ClusterNameWorkflow.class.getSimpleName()).start(args);
// Inside the workflow each program prefixes its key with the workflow name.
String prefix = ClusterNameTestApp.ClusterNameWorkflow.class.getSimpleName() + ".";
key.set(prefix + "mr.client.cluster.name");
Tasks.waitFor(clusterName, readClusterName, 10, TimeUnit.SECONDS, 100, TimeUnit.MILLISECONDS);
key.set(prefix + "mapper.cluster.name");
Tasks.waitFor(clusterName, readClusterName, 10, TimeUnit.SECONDS, 100, TimeUnit.MILLISECONDS);
key.set(prefix + "reducer.cluster.name");
Tasks.waitFor(clusterName, readClusterName, 10, TimeUnit.SECONDS, 100, TimeUnit.MILLISECONDS);
key.set(prefix + "spark.cluster.name");
Tasks.waitFor(clusterName, readClusterName, 10, TimeUnit.SECONDS, 100, TimeUnit.MILLISECONDS);
key.set(prefix + "action.cluster.name");
Tasks.waitFor(clusterName, readClusterName, 10, TimeUnit.SECONDS, 100, TimeUnit.MILLISECONDS);
workflowManager.waitForRun(ProgramRunStatus.COMPLETED, 120, TimeUnit.SECONDS);
}
Use of co.cask.cdap.api.dataset.lib.FileSet in project cdap by caskdata.
Class SparkFileSetTestRun, method testSparkWithFileSet.
/**
 * Runs the given Spark program against the "fs" FileSet: prepares input under
 * path "nn", starts the program with dataset-scoped input/output arguments,
 * validates the output written under path "xx", then cleans up both paths.
 *
 * @param applicationManager manager of the deployed application
 * @param sparkProgram name of the Spark program to run
 * @throws Exception if the run fails or does not complete within one minute
 */
private void testSparkWithFileSet(ApplicationManager applicationManager, String sparkProgram) throws Exception {
  DataSetManager<FileSet> filesetManager = getDataset("fs");
  final FileSet fileset = filesetManager.get();
  Location location = fileset.getLocation("nn");
  prepareFileInput(location);
  Map<String, String> inputArgs = new HashMap<>();
  FileSetArguments.setInputPath(inputArgs, "nn");
  Map<String, String> outputArgs = new HashMap<>();
  // Fix: the output path was previously set on inputArgs, leaving outputArgs
  // empty (dead). It only worked because both maps are scoped to the same
  // dataset "fs" below; setting it on outputArgs matches the clear intent.
  FileSetArguments.setOutputPath(outputArgs, "xx");
  Map<String, String> args = new HashMap<>();
  args.putAll(RuntimeArguments.addScope(Scope.DATASET, "fs", inputArgs));
  args.putAll(RuntimeArguments.addScope(Scope.DATASET, "fs", outputArgs));
  args.put("input", "fs");
  args.put("output", "fs");
  SparkManager sparkManager = applicationManager.getSparkManager(sparkProgram).start(args);
  sparkManager.waitForRun(ProgramRunStatus.COMPLETED, 1, TimeUnit.MINUTES);
  validateFileOutput(fileset.getLocation("xx"), "custom:");
  // Cleanup paths after running test
  fileset.getLocation("nn").delete(true);
  fileset.getLocation("xx").delete(true);
}
Use of co.cask.cdap.api.dataset.lib.FileSet in project cdap by caskdata.
Class SparkTestRun, method testSparkWithGetDataset.
/**
 * Deploys the given application and runs the named Spark program, which reads
 * log data from the "logs" FileSet (path "nn") and writes aggregated results
 * to the "logStats" KeyValueTable. Validates the results, then removes the
 * input files and every row written to the stats table.
 *
 * @param appClass application class to deploy
 * @param sparkProgram name of the Spark program to run
 * @throws Exception if the run fails or does not complete within two minutes
 */
private void testSparkWithGetDataset(Class<? extends Application> appClass, String sparkProgram) throws Exception {
  ApplicationManager appManager = deploy(appClass);
  // Stage the input log data at path "nn" of the "logs" FileSet.
  DataSetManager<FileSet> logsManager = getDataset("logs");
  FileSet logsFileSet = logsManager.get();
  Location inputLocation = logsFileSet.getLocation("nn");
  prepareInputFileSetWithLogData(inputLocation);
  // Build the runtime arguments: a dataset-scoped input path plus the
  // program-level dataset names.
  Map<String, String> datasetArgs = new HashMap<>();
  FileSetArguments.setInputPath(datasetArgs, "nn");
  Map<String, String> runtimeArgs = new HashMap<>(RuntimeArguments.addScope(Scope.DATASET, "logs", datasetArgs));
  runtimeArgs.put("input", "logs");
  runtimeArgs.put("output", "logStats");
  SparkManager sparkManager = appManager.getSparkManager(sparkProgram).start(runtimeArgs);
  sparkManager.waitForRun(ProgramRunStatus.COMPLETED, 2, TimeUnit.MINUTES);
  // Validate the aggregated output written by the program.
  DataSetManager<KeyValueTable> statsManager = getDataset("logStats");
  KeyValueTable statsTable = statsManager.get();
  validateGetDatasetOutput(statsTable);
  // Cleanup after run: drop the input files and clear the stats table.
  inputLocation.delete(true);
  statsManager.flush();
  try (CloseableIterator<KeyValue<byte[], byte[]>> rows = statsTable.scan(null, null)) {
    while (rows.hasNext()) {
      statsTable.delete(rows.next().getKey());
    }
  }
  statsManager.flush();
}
Aggregations