use of co.cask.cdap.api.dataset.lib.FileSet in project cdap by caskdata.
the class TextFileSetSource method onRunFinish.
// onRunFinish is called at the end of the pipeline run by the client that submitted the batch job.
@Override
public void onRunFinish(boolean succeeded, BatchSourceContext context) {
  // In our case, we want to delete the data read during this run if the run succeeded.
  if (succeeded && config.deleteInputOnSuccess) {
    Map<String, String> arguments = new HashMap<>();
    FileSetArguments.setInputPaths(arguments, config.files);
    FileSet fileSet = context.getDataset(config.fileSetName, arguments);
    for (Location inputLocation : fileSet.getInputLocations()) {
      try {
        inputLocation.delete(true);
      } catch (IOException e) {
        throw new RuntimeException(e);
      }
    }
  }
}
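onRunFinish above reads config.fileSetName, config.files, and config.deleteInputOnSuccess. As a hedged sketch (the actual config class of TextFileSetSource may differ), a plugin config carrying those fields could look like this:
// Hedged sketch of the plugin config referenced as `config` above; the field names are taken
// from the usage in onRunFinish, but the real TextFileSetSource config may differ.
import co.cask.cdap.api.annotation.Description;
import co.cask.cdap.api.plugin.PluginConfig;

public class TextFileSetConfig extends PluginConfig {
  @Description("The name of the FileSet to read from.")
  public String fileSetName;

  @Description("Comma-separated list of input paths to read.")
  public String files;

  @Description("Whether to delete the input paths after a successful run.")
  public boolean deleteInputOnSuccess;
}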
use of co.cask.cdap.api.dataset.lib.FileSet in project cdap by caskdata.
the class ExploreTableManager method generateEnableStatement.
/**
 * Generate a Hive DDL statement to create a Hive table for the given dataset.
 *
 * @param dataset the instantiated dataset
 * @param spec the dataset specification
 * @param datasetId the dataset id
 * @param tableName the name of the Hive table to create
 * @param truncating whether this call to create() is part of a truncate() operation, which is in some
 *                   cases implemented using disableExplore() followed by enableExplore()
 *
 * @return a CREATE TABLE statement, or null if the dataset is not explorable
 * @throws UnsupportedTypeException if the dataset is a RecordScannable of a type that is not supported by Hive
 */
@Nullable
private String generateEnableStatement(Dataset dataset, DatasetSpecification spec, DatasetId datasetId,
                                       String tableName, boolean truncating)
  throws UnsupportedTypeException, ExploreException {

  String datasetName = datasetId.getDataset();
  Map<String, String> serdeProperties = ImmutableMap.of(Constants.Explore.DATASET_NAME, datasetId.getDataset(),
                                                        Constants.Explore.DATASET_NAMESPACE, datasetId.getNamespace());
  // The dataset must either be record-scannable/writable, or it must be a FileSet or a
  // PartitionedFileSet with explore enabled in its properties.
  if (dataset instanceof Table) {
    // It is valid for a table not to have a schema property. This logic should really be in Table.
    return generateCreateStatementFromSchemaProperty(spec, datasetId, tableName, serdeProperties, false);
  }
  if (dataset instanceof ObjectMappedTable) {
    return generateCreateStatementFromSchemaProperty(spec, datasetId, tableName, serdeProperties, true);
  }
  boolean isRecordScannable = dataset instanceof RecordScannable;
  boolean isRecordWritable = dataset instanceof RecordWritable;
  if (isRecordScannable || isRecordWritable) {
    Type recordType = isRecordScannable
      ? ((RecordScannable) dataset).getRecordType() : ((RecordWritable) dataset).getRecordType();
    // Use == because we are checking for exactly the same class.
    if (StructuredRecord.class == recordType) {
      return generateCreateStatementFromSchemaProperty(spec, datasetId, tableName, serdeProperties, true);
    }
    // Otherwise, derive the schema from the record type.
    LOG.debug("Enabling explore for dataset instance {}", datasetName);
    String databaseName = ExploreProperties.getExploreDatabaseName(spec.getProperties());
    return new CreateStatementBuilder(datasetName, databaseName, tableName, shouldEscapeColumns)
      .setSchema(hiveSchemaFor(recordType))
      .setTableComment("CDAP Dataset")
      .buildWithStorageHandler(DatasetStorageHandler.class.getName(), serdeProperties);
  } else if (dataset instanceof FileSet || dataset instanceof PartitionedFileSet) {
    Map<String, String> properties = spec.getProperties();
    if (FileSetProperties.isExploreEnabled(properties)) {
      LOG.debug("Enabling explore for dataset instance {}", datasetName);
      return generateFileSetCreateStatement(datasetId, dataset, properties, truncating);
    }
  }
  // The dataset is not explorable.
  return null;
}
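The FileSet branch above only returns a statement when FileSetProperties.isExploreEnabled(properties) is true. A hedged sketch of declaring a FileSet with explore enabled at application configure time (the dataset name, base path, and schema below are illustrative; the builder methods are assumed from the FileSetProperties API):
// Illustrative only: declare an explore-enabled FileSet in an application's configure().
// The dataset name, base path, and text format/schema are assumptions for this sketch.
createDataset("lines", FileSet.class, FileSetProperties.builder()
  .setBasePath("example/lines")
  .setInputFormat(TextInputFormat.class)
  .setOutputFormat(TextOutputFormat.class)
  .setEnableExploreOnCreate(true)
  .setExploreFormat("text")
  .setExploreFormatProperty("delimiter", "\n")
  .setExploreSchema("record STRING")
  .build());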
use of co.cask.cdap.api.dataset.lib.FileSet in project cdap by caskdata.
the class ExploreTableManager method generateDisableStatement.
private String generateDisableStatement(DatasetId datasetId, DatasetSpecification spec) throws ExploreException {
  String tableName = tableNaming.getTableName(datasetId, spec.getProperties());
  String databaseName = ExploreProperties.getExploreDatabaseName(spec.getProperties());
  // If the table does not exist, there is nothing to be done.
  try {
    exploreService.getTableInfo(datasetId.getNamespace(), databaseName, tableName);
  } catch (TableNotFoundException e) {
    // Ignore the exception: it means the table was not found.
    return null;
  }
  Dataset dataset = null;
  try (SystemDatasetInstantiator datasetInstantiator = datasetInstantiatorFactory.create()) {
    dataset = datasetInstantiator.getDataset(datasetId);
    if (dataset instanceof FileSet || dataset instanceof PartitionedFileSet) {
      // Do not drop the explore table if the dataset is reusing an existing table.
      if (FileSetProperties.isUseExisting(spec.getProperties())) {
        return null;
      }
    }
    return generateDeleteStatement(dataset, databaseName, tableName);
  } catch (IOException e) {
    LOG.error("Exception creating dataset classLoaderProvider for dataset {}.", datasetId, e);
    throw new ExploreException("Exception instantiating dataset " + datasetId);
  } finally {
    Closeables.closeQuietly(dataset);
  }
}
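generateDeleteStatement is not shown in this excerpt. As a rough, hypothetical sketch of the kind of DDL it must produce (the real method in ExploreTableManager additionally handles escaping and dataset-specific details):
// Hypothetical helper, not the actual ExploreTableManager code: it only illustrates the
// shape of the DROP statement that generateDisableStatement ultimately returns.
private String sketchDeleteStatement(@Nullable String databaseName, String tableName) {
  String qualifiedName = databaseName == null ? tableName : databaseName + "." + tableName;
  return "DROP TABLE IF EXISTS " + qualifiedName;
}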
use of co.cask.cdap.api.dataset.lib.FileSet in project cdap by caskdata.
the class SparkFileSetTestRun method testSparkWithCustomFileSet.
private void testSparkWithCustomFileSet(ApplicationManager applicationManager, String sparkProgram) throws Exception {
  final DataSetManager<SparkAppUsingFileSet.MyFileSet> myFileSetManager = getDataset("myfs");
  SparkAppUsingFileSet.MyFileSet myfileset = myFileSetManager.get();
  final FileSet fileset = myfileset.getEmbeddedFileSet();
  Location location = fileset.getLocation("nn");
  prepareFileInput(location);
  Map<String, String> inputArgs = new HashMap<>();
  FileSetArguments.setInputPath(inputArgs, "nn");
  Map<String, String> outputArgs = new HashMap<>();
  FileSetArguments.setOutputPath(outputArgs, "xx");
  Map<String, String> args = new HashMap<>();
  args.putAll(RuntimeArguments.addScope(Scope.DATASET, "myfs", inputArgs));
  args.putAll(RuntimeArguments.addScope(Scope.DATASET, "myfs", outputArgs));
  args.put("input", "myfs");
  args.put("output", "myfs");
  SparkManager sparkManager = applicationManager.getSparkManager(sparkProgram).start(args);
  sparkManager.waitForRun(ProgramRunStatus.COMPLETED, 2, TimeUnit.MINUTES);
  Assert.assertEquals(1, sparkManager.getHistory(ProgramRunStatus.COMPLETED).size());
  validateFileOutput(fileset.getLocation("xx"));
  // Verify that onSuccess() was called and onFailure() was not.
  Assert.assertTrue(myfileset.getSuccessLocation().exists());
  Assert.assertFalse(myfileset.getFailureLocation().exists());
  myfileset.getSuccessLocation().delete();
  // Run the program again. It should fail due to the existing output.
  sparkManager = applicationManager.getSparkManager(sparkProgram).start(args);
  sparkManager.waitForRun(ProgramRunStatus.FAILED, 2, TimeUnit.MINUTES);
  // Now verify that onFailure() was called.
  Assert.assertFalse(myfileset.getSuccessLocation().exists());
  Assert.assertTrue(myfileset.getFailureLocation().exists());
  // Clean up the paths after running the Spark program.
  fileset.getLocation("nn").delete(true);
  fileset.getLocation("xx").delete(true);
  myfileset.getSuccessLocation().delete(true);
  myfileset.getFailureLocation().delete(true);
}
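The test assumes a custom dataset, SparkAppUsingFileSet.MyFileSet, that embeds a FileSet and exposes marker locations written by onSuccess()/onFailure(). A hedged sketch of such a dataset (the real class in cdap may differ in its details):
// Hedged sketch of a MyFileSet-style custom dataset: it embeds a FileSet and exposes
// success/failure marker locations. The field and child-dataset names are assumptions.
public class MyFileSet extends AbstractDataset {
  private final FileSet embedded;

  public MyFileSet(DatasetSpecification spec, @EmbeddedDataset("files") FileSet embedded) {
    super(spec.getName(), embedded);
    this.embedded = embedded;
  }

  public FileSet getEmbeddedFileSet() {
    return embedded;
  }

  public Location getSuccessLocation() throws IOException {
    return embedded.getBaseLocation().append("success");
  }

  public Location getFailureLocation() throws IOException {
    return embedded.getBaseLocation().append("failure");
  }
}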
use of co.cask.cdap.api.dataset.lib.FileSet in project cdap by caskdata.
the class SparkCSVToSpaceProgram method run.
@Override
public void run(final JavaSparkExecutionContext sec) throws Exception {
  JavaSparkContext jsc = new JavaSparkContext();
  Map<String, String> fileSetArgs = new HashMap<>();
  final Metrics metrics = sec.getMetrics();
  FileSetArguments.addInputPath(fileSetArgs, sec.getRuntimeArguments().get("input.path"));
  // Read the input file set as an RDD of (offset, line) pairs.
  JavaPairRDD<LongWritable, Text> input = sec.fromDataset(WorkflowAppWithLocalDatasets.CSV_FILESET_DATASET, fileSetArgs);
  // Replace the commas in every line with spaces, counting the lines in a metric.
  final List<String> converted = input.values().map(new Function<Text, String>() {
    @Override
    public String call(Text input) throws Exception {
      String line = input.toString();
      metrics.count("num.lines", 1);
      return line.replaceAll(",", " ");
    }
  }).collect();
  // Write the converted lines to the output path of the file set, within a transaction.
  sec.execute(new TxRunnable() {
    @Override
    public void run(DatasetContext context) throws Exception {
      Map<String, String> args = sec.getRuntimeArguments();
      String outputPath = args.get("output.path");
      Map<String, String> fileSetArgs = new HashMap<>();
      FileSetArguments.setOutputPath(fileSetArgs, outputPath);
      FileSet fileSet = context.getDataset(WorkflowAppWithLocalDatasets.CSV_FILESET_DATASET, fileSetArgs);
      try (PrintWriter writer = new PrintWriter(fileSet.getOutputLocation().getOutputStream())) {
        for (String line : converted) {
          writer.write(line);
          writer.println();
        }
      }
    }
  });
}
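run() above reads the "input.path" and "output.path" runtime arguments. A hedged sketch of how a caller might supply them when starting the program (the paths and the manner of starting are illustrative; only the argument keys come from the code above):
// Illustrative only: the paths are placeholders; the keys match what run() reads above.
Map<String, String> runtimeArgs = new HashMap<>();
runtimeArgs.put("input.path", "/tmp/example/input.csv");
runtimeArgs.put("output.path", "/tmp/example/output");
// e.g. from a test: applicationManager.getSparkManager("SparkCSVToSpaceProgram").start(runtimeArgs);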