Use of io.cdap.cdap.api.dataset.lib.FileSet in project cdap by caskdata.
The class FileSetTest, method testExternalAbsolutePath:
@Test
public void testExternalAbsolutePath() throws IOException, DatasetManagementException, UnauthorizedException {
  // create an external dir and create a file in it
  String absolutePath = tmpFolder.newFolder() + "/absolute/path";
  File absoluteFile = new File(absolutePath);
  absoluteFile.mkdirs();
  File someFile = new File(absoluteFile, "some.file");
  someFile.createNewFile();
  // create an external dataset
  dsFrameworkUtil.createInstance("fileSet", testFileSetInstance5,
                                 FileSetProperties.builder().setBasePath(absolutePath).setDataExternal(true).build());
  // instantiate the file set with an input and output path
  Map<String, String> fileArgs = Maps.newHashMap();
  FileSetArguments.setInputPath(fileArgs, "some.file");
  FileSetArguments.setOutputPath(fileArgs, "out");
  FileSet fileSet = dsFrameworkUtil.getInstance(testFileSetInstance5, fileArgs);
  Assert.assertNotNull(fileSet);
  // read the existing file
  Location input = fileSet.getInputLocations().iterator().next();
  InputStream in = input.getInputStream();
  in.close();
  // attempt to write an output file
  try {
    fileSet.getOutputLocation();
    Assert.fail("External file set should not allow writing output.");
  } catch (UnsupportedOperationException e) {
    // expected
  }
  // delete the dataset and validate that the files are still there
  dsFrameworkUtil.deleteInstance(testFileSetInstance5);
  Assert.assertTrue(someFile.exists());
}
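
For context, an external FileSet such as the one created by this test could also be declared from an application. The sketch below is illustrative only and is not taken from the cdap repository; the application class and dataset name are made up, but the FileSetProperties calls mirror the ones used above.

public class ExternalFileApp extends AbstractApplication {
  @Override
  public void configure() {
    // Hypothetical declaration of an external FileSet; names are illustrative.
    createDataset("externalFiles", FileSet.class, FileSetProperties.builder()
      // point the dataset at a directory that is managed outside of CDAP
      .setBasePath("/existing/data/dir")
      // mark the data as external: it can be read through the FileSet but is never written or deleted by it
      .setDataExternal(true)
      .build());
  }
}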
Use of io.cdap.cdap.api.dataset.lib.FileSet in project cdap by caskdata.
The class FilesetDeletePostAction, method run:
@Override
public void run(BatchActionContext context) throws Exception {
  if (!context.isSuccessful()) {
    return;
  }
  FileSet fileSet = context.getDataset(config.filesetName);
  Pattern pattern = Pattern.compile(config.deleteRegex);
  for (Location fileLocation : fileSet.getBaseLocation().append(config.directory).list()) {
    if (pattern.matcher(fileLocation.getName()).find()) {
      fileLocation.delete();
    }
  }
}
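
The config object used in run() is not part of this snippet. As a rough sketch, a plugin config carrying the three values referenced above could look like the following; the field names are inferred from the usage, and the class shown here is hypothetical rather than the one in the cdap repository.

public static class Conf extends PluginConfig {
  // name of the FileSet dataset to clean up after the batch run
  private String filesetName;
  // directory within the fileset whose files are inspected
  private String directory;
  // files whose names match this regular expression are deleted
  private String deleteRegex;
}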
Use of io.cdap.cdap.api.dataset.lib.FileSet in project cdap by caskdata.
The class SparkCSVToSpaceProgram, method run:
@Override
public void run(final JavaSparkExecutionContext sec) throws Exception {
  JavaSparkContext jsc = new JavaSparkContext();
  Map<String, String> fileSetArgs = new HashMap<>();
  final Metrics metrics = sec.getMetrics();
  FileSetArguments.addInputPath(fileSetArgs, sec.getRuntimeArguments().get("input.path"));
  JavaPairRDD<LongWritable, Text> input = sec.fromDataset(WorkflowAppWithLocalDatasets.CSV_FILESET_DATASET, fileSetArgs);
  final List<String> converted = input.values().map(new Function<Text, String>() {
    @Override
    public String call(Text input) throws Exception {
      String line = input.toString();
      metrics.count("num.lines", 1);
      return line.replaceAll(",", " ");
    }
  }).collect();
  sec.execute(new TxRunnable() {
    @Override
    public void run(DatasetContext context) throws Exception {
      Map<String, String> args = sec.getRuntimeArguments();
      String outputPath = args.get("output.path");
      Map<String, String> fileSetArgs = new HashMap<>();
      FileSetArguments.setOutputPath(fileSetArgs, outputPath);
      FileSet fileSet = context.getDataset(WorkflowAppWithLocalDatasets.CSV_FILESET_DATASET, fileSetArgs);
      try (PrintWriter writer = new PrintWriter(fileSet.getOutputLocation().getOutputStream())) {
        for (String line : converted) {
          writer.write(line);
          writer.println();
        }
      }
    }
  });
}
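
The program above reads its locations from the runtime arguments input.path and output.path. A hedged example of supplying them when starting the program from a test, using the same SparkManager API that appears elsewhere on this page; the program name, appManager variable, and path values are placeholders:

Map<String, String> runtimeArgs = new HashMap<>();
runtimeArgs.put("input.path", "csv/input.txt");   // placeholder path inside the CSV fileset
runtimeArgs.put("output.path", "spaces/output");  // placeholder path for the converted lines
SparkManager sparkManager = appManager.getSparkManager("SparkCSVToSpaceProgram").start(runtimeArgs);
sparkManager.waitForRun(ProgramRunStatus.COMPLETED, 2, TimeUnit.MINUTES);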
Use of io.cdap.cdap.api.dataset.lib.FileSet in project cdap by caskdata.
The class TestFrameworkTestRun, method testAppWithPlugin:
@Test
public void testAppWithPlugin() throws Exception {
  ArtifactId artifactId = NamespaceId.DEFAULT.artifact("app-with-plugin", "1.0.0-SNAPSHOT");
  addAppArtifact(artifactId, AppWithPlugin.class);
  ArtifactId pluginArtifactId = NamespaceId.DEFAULT.artifact("test-plugin", "1.0.0-SNAPSHOT");
  addPluginArtifact(pluginArtifactId, artifactId, ToStringPlugin.class);
  ApplicationId appId = NamespaceId.DEFAULT.app("AppWithPlugin");
  AppRequest createRequest = new AppRequest(new ArtifactSummary(artifactId.getArtifact(), artifactId.getVersion()));
  ApplicationManager appManager = deployApplication(appId, createRequest);
  final WorkerManager workerManager = appManager.getWorkerManager(AppWithPlugin.WORKER);
  workerManager.start();
  workerManager.waitForRun(ProgramRunStatus.COMPLETED, 10, TimeUnit.SECONDS);
  final ServiceManager serviceManager = appManager.getServiceManager(AppWithPlugin.SERVICE);
  serviceManager.start();
  serviceManager.waitForRun(ProgramRunStatus.RUNNING, 10, TimeUnit.SECONDS);
  URL serviceURL = serviceManager.getServiceURL(5, TimeUnit.SECONDS);
  callServiceGet(serviceURL, "dummy");
  serviceManager.stop();
  serviceManager.waitForStopped(10, TimeUnit.SECONDS);
  WorkflowManager workflowManager = appManager.getWorkflowManager(AppWithPlugin.WORKFLOW);
  workflowManager.start();
  workflowManager.waitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
  List<RunRecord> runRecords = workflowManager.getHistory();
  Assert.assertNotEquals(ProgramRunStatus.FAILED, runRecords.get(0).getStatus());
  DataSetManager<KeyValueTable> workflowTableManager = getDataset(AppWithPlugin.WORKFLOW_TABLE);
  String value = Bytes.toString(workflowTableManager.get().read("val"));
  Assert.assertEquals(AppWithPlugin.TEST, value);
  Map<String, String> workflowTags = ImmutableMap.of(Constants.Metrics.Tag.NAMESPACE, NamespaceId.DEFAULT.getNamespace(),
                                                     Constants.Metrics.Tag.APP, "AppWithPlugin",
                                                     Constants.Metrics.Tag.WORKFLOW, AppWithPlugin.WORKFLOW,
                                                     Constants.Metrics.Tag.RUN_ID, runRecords.get(0).getPid());
  getMetricsManager().waitForTotalMetricCount(workflowTags, String.format("user.destroy.%s", AppWithPlugin.WORKFLOW), 1, 60, TimeUnit.SECONDS);
  // Testing Spark plugins. First send some data to the fileset for the Spark program to process
  DataSetManager<FileSet> fileSetManager = getDataset(AppWithPlugin.SPARK_INPUT);
  FileSet fileSet = fileSetManager.get();
  try (PrintStream out = new PrintStream(fileSet.getLocation("input").append("file.txt").getOutputStream(), true, "UTF-8")) {
    for (int i = 0; i < 5; i++) {
      out.println("Message " + i);
    }
  }
  Map<String, String> sparkArgs = new HashMap<>();
  FileSetArguments.setInputPath(sparkArgs, "input");
  SparkManager sparkManager = appManager.getSparkManager(AppWithPlugin.SPARK).start(sparkArgs);
  sparkManager.waitForRun(ProgramRunStatus.COMPLETED, 2, TimeUnit.MINUTES);
  // Verify the Spark result.
  DataSetManager<Table> dataSetManager = getDataset(AppWithPlugin.SPARK_TABLE);
  Table table = dataSetManager.get();
  try (Scanner scanner = table.scan(null, null)) {
    for (int i = 0; i < 5; i++) {
      Row row = scanner.next();
      Assert.assertNotNull(row);
      String expected = "Message " + i + " " + AppWithPlugin.TEST;
      Assert.assertEquals(expected, Bytes.toString(row.getRow()));
      Assert.assertEquals(expected, Bytes.toString(row.get(expected)));
    }
    // There shouldn't be any more rows in the table.
    Assert.assertNull(scanner.next());
  }
}
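
callServiceGet is a helper of the test class and is not shown on this page. A rough, hypothetical stand-in using plain java.net, only to illustrate what the call does:

// Hypothetical equivalent of the callServiceGet helper: issue a GET against the service URL.
HttpURLConnection conn = (HttpURLConnection) new URL(serviceURL, "dummy").openConnection();
try {
  Assert.assertEquals(200, conn.getResponseCode());
  // the response body could be read from conn.getInputStream() if needed
} finally {
  conn.disconnect();
}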
Use of io.cdap.cdap.api.dataset.lib.FileSet in project cdap by caskdata.
The class TestFrameworkTestRun, method testClusterName:
@Test
public void testClusterName() throws Exception {
  String clusterName = getConfiguration().get(Constants.CLUSTER_NAME);
  ApplicationManager appManager = deployApplication(ClusterNameTestApp.class);
  final DataSetManager<KeyValueTable> datasetManager = getDataset(ClusterNameTestApp.CLUSTER_NAME_TABLE);
  final KeyValueTable clusterNameTable = datasetManager.get();
  // A callable for reading the cluster name from the ClusterNameTable.
  // It is used by the Tasks.waitFor calls below.
  final AtomicReference<String> key = new AtomicReference<>();
  Callable<String> readClusterName = new Callable<String>() {
    @Nullable
    @Override
    public String call() throws Exception {
      datasetManager.flush();
      byte[] bytes = clusterNameTable.read(key.get());
      return bytes == null ? null : new String(bytes, StandardCharsets.UTF_8);
    }
  };
  // Service
  ServiceManager serviceManager = appManager.getServiceManager(ClusterNameTestApp.ClusterNameServiceHandler.class.getSimpleName()).start();
  Assert.assertEquals(clusterName, callServiceGet(serviceManager.getServiceURL(10, TimeUnit.SECONDS), "clusterName"));
  serviceManager.stop();
  // Worker
  WorkerManager workerManager = appManager.getWorkerManager(ClusterNameTestApp.ClusterNameWorker.class.getSimpleName()).start();
  key.set("worker.cluster.name");
  Tasks.waitFor(clusterName, readClusterName, 10, TimeUnit.SECONDS, 100, TimeUnit.MILLISECONDS);
  // The worker will stop by itself. No need to call stop.
  workerManager.waitForRun(ProgramRunStatus.COMPLETED, 10, TimeUnit.SECONDS);
  // MapReduce
  // Set up the input file used by the MapReduce program
  Location location = this.<FileSet>getDataset(ClusterNameTestApp.INPUT_FILE_SET).get().getLocation("input");
  try (PrintStream printer = new PrintStream(location.getOutputStream(), true, "UTF-8")) {
    for (int i = 0; i < 10; i++) {
      printer.println("Hello World " + i);
    }
  }
  // Set up input and output dataset arguments
  Map<String, String> inputArgs = new HashMap<>();
  FileSetArguments.setInputPath(inputArgs, "input");
  Map<String, String> outputArgs = new HashMap<>();
  FileSetArguments.setOutputPath(outputArgs, "output");
  Map<String, String> args = new HashMap<>();
  args.putAll(RuntimeArguments.addScope(Scope.DATASET, ClusterNameTestApp.INPUT_FILE_SET, inputArgs));
  args.putAll(RuntimeArguments.addScope(Scope.DATASET, ClusterNameTestApp.OUTPUT_FILE_SET, outputArgs));
  MapReduceManager mrManager = appManager.getMapReduceManager(ClusterNameTestApp.ClusterNameMapReduce.class.getSimpleName()).start(args);
  key.set("mr.client.cluster.name");
  Tasks.waitFor(clusterName, readClusterName, 10, TimeUnit.SECONDS, 100, TimeUnit.MILLISECONDS);
  key.set("mapper.cluster.name");
  Tasks.waitFor(clusterName, readClusterName, 10, TimeUnit.SECONDS, 100, TimeUnit.MILLISECONDS);
  key.set("reducer.cluster.name");
  Tasks.waitFor(clusterName, readClusterName, 10, TimeUnit.SECONDS, 100, TimeUnit.MILLISECONDS);
  mrManager.waitForRun(ProgramRunStatus.COMPLETED, 60, TimeUnit.SECONDS);
  // Spark
  SparkManager sparkManager = appManager.getSparkManager(ClusterNameTestApp.ClusterNameSpark.class.getSimpleName()).start();
  key.set("spark.cluster.name");
  Tasks.waitFor(clusterName, readClusterName, 10, TimeUnit.SECONDS, 100, TimeUnit.MILLISECONDS);
  sparkManager.waitForRun(ProgramRunStatus.COMPLETED, 60, TimeUnit.SECONDS);
  // Workflow
  // Clean up the output path for the MapReduce job in the workflow first
  this.<FileSet>getDataset(ClusterNameTestApp.OUTPUT_FILE_SET).get().getLocation("output").delete(true);
  args = RuntimeArguments.addScope(Scope.MAPREDUCE, ClusterNameTestApp.ClusterNameMapReduce.class.getSimpleName(), args);
  WorkflowManager workflowManager = appManager.getWorkflowManager(ClusterNameTestApp.ClusterNameWorkflow.class.getSimpleName()).start(args);
  String prefix = ClusterNameTestApp.ClusterNameWorkflow.class.getSimpleName() + ".";
  key.set(prefix + "mr.client.cluster.name");
  Tasks.waitFor(clusterName, readClusterName, 10, TimeUnit.SECONDS, 100, TimeUnit.MILLISECONDS);
  key.set(prefix + "mapper.cluster.name");
  Tasks.waitFor(clusterName, readClusterName, 10, TimeUnit.SECONDS, 100, TimeUnit.MILLISECONDS);
  key.set(prefix + "reducer.cluster.name");
  Tasks.waitFor(clusterName, readClusterName, 10, TimeUnit.SECONDS, 100, TimeUnit.MILLISECONDS);
  key.set(prefix + "spark.cluster.name");
  Tasks.waitFor(clusterName, readClusterName, 10, TimeUnit.SECONDS, 100, TimeUnit.MILLISECONDS);
  key.set(prefix + "action.cluster.name");
  Tasks.waitFor(clusterName, readClusterName, 10, TimeUnit.SECONDS, 100, TimeUnit.MILLISECONDS);
  workflowManager.waitForRun(ProgramRunStatus.COMPLETED, 120, TimeUnit.SECONDS);
}
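
Note how RuntimeArguments.addScope is used above: the FileSet arguments are first scoped to their respective datasets, and the combined map is later scoped again to the MapReduce program inside the workflow, so each argument reaches only the component it is meant for. A small hedged illustration; the dataset name is made up and the exact key format produced by addScope is an implementation detail, only sketched in the comment:

Map<String, String> inputArgs = new HashMap<>();
FileSetArguments.setInputPath(inputArgs, "input");
// Scoping ties the argument to a single dataset; the resulting key resembles
// "dataset.<datasetName>.input.path" rather than the bare "input.path".
Map<String, String> scoped = RuntimeArguments.addScope(Scope.DATASET, "myInputFileSet", inputArgs);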