Use of co.cask.cdap.api.dataset.lib.FileSet in project cdap by caskdata.
The class ExploreTableManager, method generateDisableStatement.
private String generateDisableStatement(DatasetId datasetId, DatasetSpecification spec) throws ExploreException {
  String tableName = tableNaming.getTableName(datasetId, spec.getProperties());
  String databaseName = ExploreProperties.getExploreDatabaseName(spec.getProperties());
  // If the table does not exist, there is nothing to be done
  try {
    exploreService.getTableInfo(datasetId.getNamespace(), databaseName, tableName);
  } catch (TableNotFoundException e) {
    // Ignore the exception: it simply means the table was not found
    return null;
  }
  Dataset dataset = null;
  try (SystemDatasetInstantiator datasetInstantiator = datasetInstantiatorFactory.create()) {
    dataset = datasetInstantiator.getDataset(datasetId);
    if (dataset instanceof FileSet || dataset instanceof PartitionedFileSet) {
      // do not drop the explore table if the dataset is reusing an existing table
      if (FileSetProperties.isUseExisting(spec.getProperties())) {
        return null;
      }
    }
    return generateDeleteStatement(dataset, databaseName, tableName);
  } catch (IOException e) {
    LOG.error("Exception creating dataset classLoaderProvider for dataset {}.", datasetId, e);
    throw new ExploreException("Exception instantiating dataset " + datasetId);
  } finally {
    Closeables.closeQuietly(dataset);
  }
}
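For context, a minimal sketch of how a FileSet could be flagged so that the guard above returns null. This assumes the FileSetProperties builder exposes setUseExisting(boolean) as the counterpart of the isUseExisting check; the base path is illustrative.

// Hedged sketch: properties for a FileSet that reuses a pre-existing Explore table.
// For such a dataset, generateDisableStatement() above returns null, so the shared
// table is never dropped. setUseExisting(boolean) is assumed to mirror isUseExisting().
DatasetProperties props = FileSetProperties.builder()
  .setBasePath("/existing/files")   // illustrative path
  .setUseExisting(true)             // mark the Explore table as externally owned
  .build();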
Use of co.cask.cdap.api.dataset.lib.FileSet in project cdap by caskdata.
The class SparkFileSetTestRun, method testSparkWithCustomFileSet.
private void testSparkWithCustomFileSet(ApplicationManager applicationManager, String sparkProgram) throws Exception {
  final DataSetManager<SparkAppUsingFileSet.MyFileSet> myFileSetManager = getDataset("myfs");
  SparkAppUsingFileSet.MyFileSet myfileset = myFileSetManager.get();
  final FileSet fileset = myfileset.getEmbeddedFileSet();
  Location location = fileset.getLocation("nn");
  prepareFileInput(location);
  Map<String, String> inputArgs = new HashMap<>();
  FileSetArguments.setInputPath(inputArgs, "nn");
  Map<String, String> outputArgs = new HashMap<>();
  FileSetArguments.setOutputPath(outputArgs, "xx");
  Map<String, String> args = new HashMap<>();
  args.putAll(RuntimeArguments.addScope(Scope.DATASET, "myfs", inputArgs));
  args.putAll(RuntimeArguments.addScope(Scope.DATASET, "myfs", outputArgs));
  args.put("input", "myfs");
  args.put("output", "myfs");
  SparkManager sparkManager = applicationManager.getSparkManager(sparkProgram).start(args);
  sparkManager.waitForRun(ProgramRunStatus.COMPLETED, 2, TimeUnit.MINUTES);
  Assert.assertEquals(1, sparkManager.getHistory(ProgramRunStatus.COMPLETED).size());
  validateFileOutput(fileset.getLocation("xx"));
  // verify that onSuccess() was called and onFailure() was not
  Assert.assertTrue(myfileset.getSuccessLocation().exists());
  Assert.assertFalse(myfileset.getFailureLocation().exists());
  myfileset.getSuccessLocation().delete();
  // run the program again; it should fail because the output already exists
  sparkManager = applicationManager.getSparkManager(sparkProgram).start(args);
  sparkManager.waitForRun(ProgramRunStatus.FAILED, 2, TimeUnit.MINUTES);
  // then we can verify that onFailure() was called
  Assert.assertFalse(myfileset.getSuccessLocation().exists());
  Assert.assertTrue(myfileset.getFailureLocation().exists());
  // clean up the paths after running the Spark program
  fileset.getLocation("nn").delete(true);
  fileset.getLocation("xx").delete(true);
  myfileset.getSuccessLocation().delete(true);
  myfileset.getFailureLocation().delete(true);
}
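The scoping above is what lets one run carry separate arguments for input and output: RuntimeArguments.addScope prefixes every key with the scope and dataset name, so only the myfs dataset sees them. A minimal sketch of the effect, with the key pattern shown as <scope>.<name>.<key>:

// Hedged sketch: addScope returns a new map whose keys are prefixed with
// "<scope>.<name>.", e.g. an input-path key becomes "dataset.myfs.<key>".
Map<String, String> raw = new HashMap<>();
FileSetArguments.setInputPath(raw, "nn");
Map<String, String> scoped = RuntimeArguments.addScope(Scope.DATASET, "myfs", raw);
// every key in 'scoped' now starts with "dataset.myfs."
for (String key : scoped.keySet()) {
  System.out.println(key);
}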
Use of co.cask.cdap.api.dataset.lib.FileSet in project cdap by caskdata.
The class SparkCSVToSpaceProgram, method run.
@Override
public void run(final JavaSparkExecutionContext sec) throws Exception {
  JavaSparkContext jsc = new JavaSparkContext();
  Map<String, String> fileSetArgs = new HashMap<>();
  final Metrics metrics = sec.getMetrics();
  FileSetArguments.addInputPath(fileSetArgs, sec.getRuntimeArguments().get("input.path"));
  // read the CSV fileset as <offset, line> pairs
  JavaPairRDD<LongWritable, Text> input = sec.fromDataset(WorkflowAppWithLocalDatasets.CSV_FILESET_DATASET, fileSetArgs);
  // replace each comma with a space, counting lines as we go
  final List<String> converted = input.values().map(new Function<Text, String>() {
    @Override
    public String call(Text input) throws Exception {
      String line = input.toString();
      metrics.count("num.lines", 1);
      return line.replaceAll(",", " ");
    }
  }).collect();
  // write the converted lines to the output fileset within a transaction
  sec.execute(new TxRunnable() {
    @Override
    public void run(DatasetContext context) throws Exception {
      Map<String, String> args = sec.getRuntimeArguments();
      String outputPath = args.get("output.path");
      Map<String, String> fileSetArgs = new HashMap<>();
      FileSetArguments.setOutputPath(fileSetArgs, outputPath);
      FileSet fileSet = context.getDataset(WorkflowAppWithLocalDatasets.CSV_FILESET_DATASET, fileSetArgs);
      try (PrintWriter writer = new PrintWriter(fileSet.getOutputLocation().getOutputStream())) {
        for (String line : converted) {
          writer.write(line);
          writer.println();
        }
      }
    }
  });
}
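The program reads its locations from the runtime arguments "input.path" and "output.path". A hedged sketch of how a caller might supply them when starting the program; the sparkManager variable and the paths are assumptions for illustration:

// Hedged sketch: runtime arguments matching the get() calls in run() above.
Map<String, String> runtimeArgs = new HashMap<>();
runtimeArgs.put("input.path", "csv/input.txt");   // consumed by addInputPath(...)
runtimeArgs.put("output.path", "converted");      // consumed inside the TxRunnable
sparkManager.start(runtimeArgs);                  // 'sparkManager' is assumed to exist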
Use of co.cask.cdap.api.dataset.lib.FileSet in project cdap by caskdata.
The class TestFrameworkTestRun, method testCustomActionDatasetAccess.
@Category(SlowTests.class)
@Test
public void testCustomActionDatasetAccess() throws Exception {
  addDatasetInstance("keyValueTable", DatasetWithCustomActionApp.CUSTOM_TABLE);
  addDatasetInstance("fileSet", DatasetWithCustomActionApp.CUSTOM_FILESET);
  ApplicationManager appManager = deployApplication(DatasetWithCustomActionApp.class);
  ServiceManager serviceManager = appManager.getServiceManager(DatasetWithCustomActionApp.CUSTOM_SERVICE).start();
  serviceManager.waitForStatus(true);
  WorkflowManager workflowManager = appManager.getWorkflowManager(DatasetWithCustomActionApp.CUSTOM_WORKFLOW).start();
  workflowManager.waitForRun(ProgramRunStatus.COMPLETED, 2, TimeUnit.MINUTES);
  appManager.stopAll();
  // verify the values written by the custom action and the service
  DataSetManager<KeyValueTable> outTableManager = getDataset(DatasetWithCustomActionApp.CUSTOM_TABLE);
  KeyValueTable outputTable = outTableManager.get();
  Assert.assertEquals("world", Bytes.toString(outputTable.read("hello")));
  Assert.assertEquals("service", Bytes.toString(outputTable.read("hi")));
  Assert.assertEquals("another.world", Bytes.toString(outputTable.read("another.hello")));
  DataSetManager<FileSet> outFileSetManager = getDataset(DatasetWithCustomActionApp.CUSTOM_FILESET);
  FileSet fs = outFileSetManager.get();
  try (InputStream in = fs.getLocation("test").getInputStream()) {
    Assert.assertEquals(42, in.read());
  }
}
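The last assertion reads a single byte from the file "test"; for it to pass, the custom action in DatasetWithCustomActionApp must have written that byte. A hedged sketch of what such a write looks like; the context variable is an assumption, and the app's actual code is not shown here:

// Hedged sketch: the write that would satisfy the assertion above.
FileSet files = context.getDataset(DatasetWithCustomActionApp.CUSTOM_FILESET);
try (OutputStream out = files.getLocation("test").getOutputStream()) {
  out.write(42); // the test reads this single byte back
}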
Use of co.cask.cdap.api.dataset.lib.FileSet in project cdap by caskdata.
The class AdminAppTestRun, method testAdminProgram.
private <T extends ProgramManager<T>> void testAdminProgram(ProgramManager<T> manager) throws Exception {
  // create fileset b; it will be updated by the worker
  addDatasetInstance(FileSet.class.getName(), "b", FileSetProperties.builder()
    .setBasePath("some/path")
    .setInputFormat(TextInputFormat.class)
    .build());
  DataSetManager<FileSet> bManager = getDataset("b");
  String bFormat = bManager.get().getInputFormatClassName();
  String bPath = bManager.get().getBaseLocation().toURI().getPath();
  Assert.assertTrue(bPath.endsWith("some/path/"));
  bManager.flush();
  // create table c and write some data to it; it will be truncated by the worker
  addDatasetInstance("table", "c");
  DataSetManager<Table> cManager = getDataset("c");
  cManager.get().put(new Put("x", "y", "z"));
  cManager.flush();
  // create table d; it will be dropped by the worker
  addDatasetInstance("table", "d");
  // start the worker and wait for it to finish
  File newBasePath = new File(TMP_FOLDER.newFolder(), "extra");
  Assert.assertFalse(newBasePath.exists());
  manager.start(ImmutableMap.of("new.base.path", newBasePath.getPath()));
  manager.waitForRun(ProgramRunStatus.COMPLETED, 30, TimeUnit.SECONDS);
  // validate that the worker created dataset a
  DataSetManager<Table> aManager = getDataset("a");
  Assert.assertNull(aManager.get().scan(null, null).next());
  aManager.flush();
  // validate that the worker updated fileset b; get a new instance of b
  bManager = getDataset("b");
  Assert.assertEquals(bFormat, bManager.get().getInputFormatClassName());
  String newBPath = bManager.get().getBaseLocation().toURI().getPath();
  Assert.assertTrue(newBPath.endsWith("/extra/"));
  // make sure the directory was created by the fileset update (by moving the existing base path)
  Assert.assertTrue(newBasePath.exists());
  bManager.flush();
  // validate that dataset c is empty
  Assert.assertNull(cManager.get().scan(null, null).next());
  cManager.flush();
  // validate that dataset d is gone
  Assert.assertNull(getDataset("d").get());
  // run the worker again to drop all datasets
  manager.start(ImmutableMap.of("dropAll", "true"));
  manager.waitForRuns(ProgramRunStatus.COMPLETED, 2, 30, TimeUnit.SECONDS);
  Assert.assertNull(getDataset("a").get());
  Assert.assertNull(getDataset("b").get());
  Assert.assertNull(getDataset("c").get());
  Assert.assertNull(getDataset("d").get());
}
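The assertions imply that the worker creates dataset a, updates fileset b, truncates table c, and drops table d. These operations map onto the program Admin API. A hedged sketch of the worker's side, assuming co.cask.cdap.api.Admin with its create/update/truncate/drop methods; the surrounding worker class is omitted:

// Hedged sketch: the admin operations the assertions above imply.
Admin admin = getContext().getAdmin();                       // inside the worker
admin.createDataset("a", Table.class.getName(), DatasetProperties.EMPTY);
admin.updateDataset("b", FileSetProperties.builder()         // move the base path
  .setBasePath(getContext().getRuntimeArguments().get("new.base.path"))
  .setInputFormat(TextInputFormat.class)
  .build());
admin.truncateDataset("c");                                  // empties table c
admin.dropDataset("d");                                      // removes table d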