Use of co.cask.cdap.api.dataset.lib.FileSet in project cdap by caskdata.
The class ExploreTableManager, method generateDisableStatement.
private String generateDisableStatement(DatasetId datasetId, DatasetSpecification spec) throws ExploreException {
  String tableName = tableNaming.getTableName(datasetId, spec.getProperties());
  String databaseName = ExploreProperties.getExploreDatabaseName(spec.getProperties());
  // If the table does not exist, there is nothing to be done
  try {
    exploreService.getTableInfo(datasetId.getNamespace(), databaseName, tableName);
  } catch (TableNotFoundException e) {
    // Ignore the exception: it simply means the table was not found
    return null;
  }
  Dataset dataset = null;
  try (SystemDatasetInstantiator datasetInstantiator = datasetInstantiatorFactory.create()) {
    dataset = datasetInstantiator.getDataset(datasetId);
    if (dataset instanceof FileSet || dataset instanceof PartitionedFileSet) {
      // do not drop the explore table if the dataset is reusing an existing table
      if (FileSetProperties.isUseExisting(spec.getProperties())) {
        return null;
      }
    }
    return generateDeleteStatement(dataset, databaseName, tableName);
  } catch (IOException e) {
    LOG.error("Exception creating dataset classLoaderProvider for dataset {}.", datasetId, e);
    throw new ExploreException("Exception instantiating dataset " + datasetId);
  } finally {
    Closeables.closeQuietly(dataset);
  }
}
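For context, a minimal sketch of how a FileSet could be flagged so that the guard above returns null. This assumes the FileSetProperties builder exposes setUseExisting(boolean) as the counterpart of the isUseExisting check; the base path is illustrative.

// Hedged sketch: properties for a FileSet that reuses a pre-existing Explore table.
// For such a dataset, generateDisableStatement() above returns null, so the shared
// table is never dropped. setUseExisting(boolean) is assumed to mirror isUseExisting().
DatasetProperties props = FileSetProperties.builder()
  .setBasePath("/existing/files")   // illustrative path
  .setUseExisting(true)             // mark the Explore table as externally owned
  .build();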
Use of co.cask.cdap.api.dataset.lib.FileSet in project cdap by caskdata.
The class SparkFileSetTestRun, method testSparkWithCustomFileSet.
private void testSparkWithCustomFileSet(ApplicationManager applicationManager, String sparkProgram) throws Exception {
  final DataSetManager<SparkAppUsingFileSet.MyFileSet> myFileSetManager = getDataset("myfs");
  SparkAppUsingFileSet.MyFileSet myfileset = myFileSetManager.get();
  final FileSet fileset = myfileset.getEmbeddedFileSet();
  Location location = fileset.getLocation("nn");
  prepareFileInput(location);
  Map<String, String> inputArgs = new HashMap<>();
  FileSetArguments.setInputPath(inputArgs, "nn");
  Map<String, String> outputArgs = new HashMap<>();
  FileSetArguments.setOutputPath(outputArgs, "xx");
  Map<String, String> args = new HashMap<>();
  args.putAll(RuntimeArguments.addScope(Scope.DATASET, "myfs", inputArgs));
  args.putAll(RuntimeArguments.addScope(Scope.DATASET, "myfs", outputArgs));
  args.put("input", "myfs");
  args.put("output", "myfs");
  SparkManager sparkManager = applicationManager.getSparkManager(sparkProgram).start(args);
  sparkManager.waitForRun(ProgramRunStatus.COMPLETED, 2, TimeUnit.MINUTES);
  Assert.assertEquals(1, sparkManager.getHistory(ProgramRunStatus.COMPLETED).size());
  validateFileOutput(fileset.getLocation("xx"));
  // verify that onSuccess() was called and onFailure() was not
  Assert.assertTrue(myfileset.getSuccessLocation().exists());
  Assert.assertFalse(myfileset.getFailureLocation().exists());
  myfileset.getSuccessLocation().delete();
  // run the program again; it should fail because the output already exists
  sparkManager = applicationManager.getSparkManager(sparkProgram).start(args);
  sparkManager.waitForRun(ProgramRunStatus.FAILED, 2, TimeUnit.MINUTES);
  // then we can verify that onFailure() was called
  Assert.assertFalse(myfileset.getSuccessLocation().exists());
  Assert.assertTrue(myfileset.getFailureLocation().exists());
  // clean up the paths after running the Spark program
  fileset.getLocation("nn").delete(true);
  fileset.getLocation("xx").delete(true);
  myfileset.getSuccessLocation().delete(true);
  myfileset.getFailureLocation().delete(true);
}
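The scoping above is what lets one run carry separate arguments for input and output: RuntimeArguments.addScope prefixes every key with the scope and dataset name, so only the myfs dataset sees them. A minimal sketch of the effect, with the key pattern shown as <scope>.<name>.<key>:

// Hedged sketch: addScope returns a new map whose keys are prefixed with
// "<scope>.<name>.", e.g. an input-path key becomes "dataset.myfs.<key>".
Map<String, String> raw = new HashMap<>();
FileSetArguments.setInputPath(raw, "nn");
Map<String, String> scoped = RuntimeArguments.addScope(Scope.DATASET, "myfs", raw);
// every key in 'scoped' now starts with "dataset.myfs."
for (String key : scoped.keySet()) {
  System.out.println(key);
}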
Use of co.cask.cdap.api.dataset.lib.FileSet in project cdap by caskdata.
The class SparkCSVToSpaceProgram, method run.
@Override
public void run(final JavaSparkExecutionContext sec) throws Exception {
  JavaSparkContext jsc = new JavaSparkContext();
  Map<String, String> fileSetArgs = new HashMap<>();
  final Metrics metrics = sec.getMetrics();
  FileSetArguments.addInputPath(fileSetArgs, sec.getRuntimeArguments().get("input.path"));
  // read the CSV fileset as <offset, line> pairs
  JavaPairRDD<LongWritable, Text> input = sec.fromDataset(WorkflowAppWithLocalDatasets.CSV_FILESET_DATASET, fileSetArgs);
  // replace each comma with a space, counting lines as we go
  final List<String> converted = input.values().map(new Function<Text, String>() {
    @Override
    public String call(Text input) throws Exception {
      String line = input.toString();
      metrics.count("num.lines", 1);
      return line.replaceAll(",", " ");
    }
  }).collect();
  // write the converted lines to the output fileset within a transaction
  sec.execute(new TxRunnable() {
    @Override
    public void run(DatasetContext context) throws Exception {
      Map<String, String> args = sec.getRuntimeArguments();
      String outputPath = args.get("output.path");
      Map<String, String> fileSetArgs = new HashMap<>();
      FileSetArguments.setOutputPath(fileSetArgs, outputPath);
      FileSet fileSet = context.getDataset(WorkflowAppWithLocalDatasets.CSV_FILESET_DATASET, fileSetArgs);
      try (PrintWriter writer = new PrintWriter(fileSet.getOutputLocation().getOutputStream())) {
        for (String line : converted) {
          writer.write(line);
          writer.println();
        }
      }
    }
  });
}
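The program reads its locations from the runtime arguments "input.path" and "output.path". A hedged sketch of how a caller might supply them when starting the program; the sparkManager variable and the paths are assumptions for illustration:

// Hedged sketch: runtime arguments matching the get() calls in run() above.
Map<String, String> runtimeArgs = new HashMap<>();
runtimeArgs.put("input.path", "csv/input.txt");   // consumed by addInputPath(...)
runtimeArgs.put("output.path", "converted");      // consumed inside the TxRunnable
sparkManager.start(runtimeArgs);                  // 'sparkManager' is assumed to exist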
Use of co.cask.cdap.api.dataset.lib.FileSet in project cdap by caskdata.
The class TestFrameworkTestRun, method testCustomActionDatasetAccess.
@Category(SlowTests.class)
@Test
public void testCustomActionDatasetAccess() throws Exception {
  addDatasetInstance("keyValueTable", DatasetWithCustomActionApp.CUSTOM_TABLE);
  addDatasetInstance("fileSet", DatasetWithCustomActionApp.CUSTOM_FILESET);
  ApplicationManager appManager = deployApplication(DatasetWithCustomActionApp.class);
  ServiceManager serviceManager = appManager.getServiceManager(DatasetWithCustomActionApp.CUSTOM_SERVICE).start();
  serviceManager.waitForStatus(true);
  WorkflowManager workflowManager = appManager.getWorkflowManager(DatasetWithCustomActionApp.CUSTOM_WORKFLOW).start();
  workflowManager.waitForRun(ProgramRunStatus.COMPLETED, 2, TimeUnit.MINUTES);
  appManager.stopAll();
  // verify the values written by the custom action and the service
  DataSetManager<KeyValueTable> outTableManager = getDataset(DatasetWithCustomActionApp.CUSTOM_TABLE);
  KeyValueTable outputTable = outTableManager.get();
  Assert.assertEquals("world", Bytes.toString(outputTable.read("hello")));
  Assert.assertEquals("service", Bytes.toString(outputTable.read("hi")));
  Assert.assertEquals("another.world", Bytes.toString(outputTable.read("another.hello")));
  DataSetManager<FileSet> outFileSetManager = getDataset(DatasetWithCustomActionApp.CUSTOM_FILESET);
  FileSet fs = outFileSetManager.get();
  try (InputStream in = fs.getLocation("test").getInputStream()) {
    Assert.assertEquals(42, in.read());
  }
}
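The last assertion reads a single byte from the file "test"; for it to pass, the custom action in DatasetWithCustomActionApp must have written that byte. A hedged sketch of what such a write looks like; the context variable is an assumption, and the app's actual code is not shown here:

// Hedged sketch: the write that would satisfy the assertion above.
FileSet files = context.getDataset(DatasetWithCustomActionApp.CUSTOM_FILESET);
try (OutputStream out = files.getLocation("test").getOutputStream()) {
  out.write(42); // the test reads this single byte back
}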
Use of co.cask.cdap.api.dataset.lib.FileSet in project cdap by caskdata.
The class AdminAppTestRun, method testAdminProgram.
private <T extends ProgramManager<T>> void testAdminProgram(ProgramManager<T> manager) throws Exception {
  // create fileset b; it will be updated by the worker
  addDatasetInstance(FileSet.class.getName(), "b", FileSetProperties.builder()
    .setBasePath("some/path")
    .setInputFormat(TextInputFormat.class)
    .build());
  DataSetManager<FileSet> bManager = getDataset("b");
  String bFormat = bManager.get().getInputFormatClassName();
  String bPath = bManager.get().getBaseLocation().toURI().getPath();
  Assert.assertTrue(bPath.endsWith("some/path/"));
  bManager.flush();
  // create table c and write some data to it; it will be truncated by the worker
  addDatasetInstance("table", "c");
  DataSetManager<Table> cManager = getDataset("c");
  cManager.get().put(new Put("x", "y", "z"));
  cManager.flush();
  // create table d; it will be dropped by the worker
  addDatasetInstance("table", "d");
  // start the worker and wait for it to finish
  File newBasePath = new File(TMP_FOLDER.newFolder(), "extra");
  Assert.assertFalse(newBasePath.exists());
  manager.start(ImmutableMap.of("new.base.path", newBasePath.getPath()));
  manager.waitForRun(ProgramRunStatus.COMPLETED, 30, TimeUnit.SECONDS);
  // validate that the worker created dataset a
  DataSetManager<Table> aManager = getDataset("a");
  Assert.assertNull(aManager.get().scan(null, null).next());
  aManager.flush();
  // validate that the worker updated fileset b; get a new instance of b
  bManager = getDataset("b");
  Assert.assertEquals(bFormat, bManager.get().getInputFormatClassName());
  String newBPath = bManager.get().getBaseLocation().toURI().getPath();
  Assert.assertTrue(newBPath.endsWith("/extra/"));
  // make sure the directory was created by the fileset update (by moving the existing base path)
  Assert.assertTrue(newBasePath.exists());
  bManager.flush();
  // validate that dataset c is empty
  Assert.assertNull(cManager.get().scan(null, null).next());
  cManager.flush();
  // validate that dataset d is gone
  Assert.assertNull(getDataset("d").get());
  // run the worker again to drop all datasets
  manager.start(ImmutableMap.of("dropAll", "true"));
  manager.waitForRuns(ProgramRunStatus.COMPLETED, 2, 30, TimeUnit.SECONDS);
  Assert.assertNull(getDataset("a").get());
  Assert.assertNull(getDataset("b").get());
  Assert.assertNull(getDataset("c").get());
  Assert.assertNull(getDataset("d").get());
}
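The assertions imply that the worker creates dataset a, updates fileset b, truncates table c, and drops table d. These operations map onto the program Admin API. A hedged sketch of the worker's side, assuming co.cask.cdap.api.Admin with its create/update/truncate/drop methods; the surrounding worker class is omitted:

// Hedged sketch: the admin operations the assertions above imply.
Admin admin = getContext().getAdmin();                       // inside the worker
admin.createDataset("a", Table.class.getName(), DatasetProperties.EMPTY);
admin.updateDataset("b", FileSetProperties.builder()         // move the base path
  .setBasePath(getContext().getRuntimeArguments().get("new.base.path"))
  .setInputFormat(TextInputFormat.class)
  .build());
admin.truncateDataset("c");                                  // empties table c
admin.dropDataset("d");                                      // removes table d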