use of io.cdap.cdap.api.dataset.lib.FileSet in project cdap by caskdata.
In the class ExploreTableManager, the method generateDisableStatement:
private String generateDisableStatement(DatasetId datasetId, DatasetSpecification spec) throws ExploreException {
String tableName = tableNaming.getTableName(datasetId, spec.getProperties());
String databaseName = ExploreProperties.getExploreDatabaseName(spec.getProperties());
// If table does not exist, nothing to be done
try {
exploreService.getTableInfo(datasetId.getNamespace(), databaseName, tableName);
} catch (TableNotFoundException e) {
// Ignore exception, since this means table was not found.
return null;
}
try (SystemDatasetInstantiator datasetInstantiator = datasetInstantiatorFactory.create()) {
Dataset dataset = datasetInstantiator.getDataset(datasetId);
try {
if (dataset instanceof FileSet || dataset instanceof PartitionedFileSet) {
// do not drop the explore table if the dataset is reusing an existing table
if (FileSetProperties.isUseExisting(spec.getProperties())) {
return null;
}
}
return generateDeleteStatement(dataset, databaseName, tableName);
} finally {
Closeables.closeQuietly(dataset);
}
} catch (IOException e) {
LOG.error("Exception creating dataset classLoaderProvider for dataset {}.", datasetId, e);
throw new ExploreException("Exception instantiating dataset " + datasetId);
}
}
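The early return above is driven entirely by the dataset's properties. Below is a minimal sketch of how a FileSet could be declared (in an AbstractApplication's configure() method) so that FileSetProperties.isUseExisting(...) returns true and the Explore table is left untouched; the setUseExisting builder method and the dataset name are assumptions for illustration and should be verified against the CDAP version in use.
// Hedged sketch: declaring a FileSet that reuses an existing Explore table.
// setUseExisting(true) is assumed to be the builder counterpart of
// FileSetProperties.isUseExisting(); the dataset name and base path are illustrative.
createDataset("reusedFiles", FileSet.class, FileSetProperties.builder()
  .setBasePath("existing/files")
  .setUseExisting(true)   // assumed builder method; makes generateDisableStatement() return null
  .build());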
use of io.cdap.cdap.api.dataset.lib.FileSet in project cdap by caskdata.
In the class MapReduceProgramRunnerTest, the method testMapreduceWithFile:
private void testMapreduceWithFile(String inputDatasetName, String inputPaths, String outputDatasetName, String outputPath, Class appClass, Class mrClass, Map<String, String> extraRuntimeArgs, @Nullable final String counterTableName, @Nullable final String outputSeparator) throws Exception {
final ApplicationWithPrograms app = deployApp(appClass, new AppWithMapReduceUsingFileSet.AppConfig(inputDatasetName, outputDatasetName));
Map<String, String> runtimeArguments = Maps.newHashMap();
Map<String, String> inputArgs = Maps.newHashMap();
Map<String, String> outputArgs = Maps.newHashMap();
FileSetArguments.setInputPaths(inputArgs, inputPaths);
FileSetArguments.setOutputPath(outputArgs, outputPath);
if (outputSeparator != null) {
outputArgs.put(FileSetProperties.OUTPUT_PROPERTIES_PREFIX + TextOutputFormat.SEPERATOR, outputSeparator);
}
runtimeArguments.putAll(RuntimeArguments.addScope(Scope.DATASET, inputDatasetName, inputArgs));
runtimeArguments.putAll(RuntimeArguments.addScope(Scope.DATASET, outputDatasetName, outputArgs));
if (extraRuntimeArgs != null) {
runtimeArguments.putAll(extraRuntimeArgs);
}
// clear the counters in case a previous test case left behind some values
if (counterTableName != null) {
Transactions.execute(datasetCache.newTransactionContext(), "countersVerify", () -> {
KeyValueTable counters = datasetCache.getDataset(counterTableName);
counters.delete(AppWithMapReduceUsingRuntimeDatasets.INPUT_RECORDS);
counters.delete(AppWithMapReduceUsingRuntimeDatasets.REDUCE_KEYS);
});
}
// write a handful of numbers to a file; compute their sum, too.
final long[] values = { 15L, 17L, 7L, 3L };
final FileSet input = datasetCache.getDataset(inputDatasetName, inputArgs);
long sum = 0L, count = 1;
long inputRecords = 0;
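// each input file receives the same four values scaled by an increasing factor (count), so every
// file contributes distinct records; sum and inputRecords accumulate the expected totals across files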
for (Location inputLocation : input.getInputLocations()) {
final PrintWriter writer = new PrintWriter(inputLocation.getOutputStream());
for (long value : values) {
value *= count;
writer.println(value);
sum += value;
inputRecords++;
}
writer.close();
count++;
}
runProgram(app, mrClass, new BasicArguments(runtimeArguments));
// output location in file system is a directory that contains a part file, a _SUCCESS file, and checksums
// (.<filename>.crc) for these files. Find the actual part file. Its name begins with "part". In this case,
// there should be only one part file (with this small data, we have a single reducer).
final FileSet results = datasetCache.getDataset(outputDatasetName, outputArgs);
Location resultLocation = results.getOutputLocation();
if (resultLocation.isDirectory()) {
for (Location child : resultLocation.list()) {
if (!child.isDirectory() && child.getName().startsWith("part")) {
resultLocation = child;
break;
}
}
}
Assert.assertFalse(resultLocation.isDirectory());
// read output and verify result
String line = CharStreams.readFirstLine(CharStreams.newReaderSupplier(Locations.newInputSupplier(resultLocation), Charsets.UTF_8));
Assert.assertNotNull(line);
String[] fields = line.split(outputSeparator == null ? ":" : outputSeparator);
Assert.assertEquals(2, fields.length);
Assert.assertEquals(AppWithMapReduceUsingFileSet.FileMapper.ONLY_KEY, fields[0]);
Assert.assertEquals(sum, Long.parseLong(fields[1]));
if (counterTableName != null) {
final long totalInputRecords = inputRecords;
Transactions.execute(datasetCache.newTransactionContext(), "countersVerify", () -> {
KeyValueTable counters = datasetCache.getDataset(counterTableName);
Assert.assertEquals(totalInputRecords, counters.incrementAndGet(AppWithMapReduceUsingRuntimeDatasets.INPUT_RECORDS, 0L));
Assert.assertEquals(1L, counters.incrementAndGet(AppWithMapReduceUsingRuntimeDatasets.REDUCE_KEYS, 0L));
});
}
}
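For reference, a call to this helper might look like the sketch below; the dataset names, input paths, and the ComputeSum MapReduce class are illustrative assumptions rather than values taken verbatim from the test suite.
// Hedged sketch of an invocation of testMapreduceWithFile(...); names and paths are illustrative.
testMapreduceWithFile("numbers", "abc, xyz", "sums", "abc",
  AppWithMapReduceUsingFileSet.class,
  AppWithMapReduceUsingFileSet.ComputeSum.class,   // assumed MapReduce class in the test app
  null,    // no extra runtime arguments
  null,    // no counter table to verify
  null);   // no custom separator; the helper then splits output on ":"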
use of io.cdap.cdap.api.dataset.lib.FileSet in project cdap by caskdata.
In the class MapReduceWithMultipleOutputsTest, the method testMultipleOutputs:
@Test
public void testMultipleOutputs() throws Exception {
ApplicationWithPrograms app = deployApp(AppWithMapReduceUsingMultipleOutputs.class);
final FileSet fileSet = datasetCache.getDataset(AppWithMapReduceUsingMultipleOutputs.PURCHASES);
Location inputFile = fileSet.getBaseLocation().append("inputFile");
inputFile.createNew();
PrintWriter writer = new PrintWriter(inputFile.getOutputStream());
// the PURCHASES dataset consists of purchase records in the format: <customerId> <spend>
writer.println("1 20");
writer.println("1 65");
writer.println("1 30");
writer.println("2 5");
writer.println("2 53");
writer.println("2 45");
writer.println("3 101");
writer.close();
// Using multiple outputs, this MapReduce sends the records to different paths of the same dataset, depending
// on the value in the data (large spend amounts go to one file, while small ones go to another).
runProgram(app, AppWithMapReduceUsingMultipleOutputs.SeparatePurchases.class, new BasicArguments());
FileSet outputFileSet = datasetCache.getDataset(AppWithMapReduceUsingMultipleOutputs.SEPARATED_PURCHASES);
Assert.assertEquals(ImmutableList.of("1 20", "1 30", "2 5", "2 45"), readFromOutput(outputFileSet, "small_purchases"));
Assert.assertEquals(ImmutableList.of("1 65", "2 53", "3 101"), readFromOutput(outputFileSet, "large_purchases"));
}
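The readFromOutput helper used in the assertions is not shown in this snippet. A minimal sketch of what it plausibly does, reading every part file under the named output directory of the FileSet, follows; the actual test utility may differ.
// Hedged sketch of a readFromOutput(FileSet, String) helper; the real implementation may differ.
private List<String> readFromOutput(FileSet fileSet, String relativePath) throws IOException {
  List<String> lines = new ArrayList<>();
  for (Location child : fileSet.getBaseLocation().append(relativePath).list()) {
    // MapReduce writes its records to files whose names start with "part"
    if (!child.isDirectory() && child.getName().startsWith("part")) {
      try (BufferedReader reader = new BufferedReader(
        new InputStreamReader(child.getInputStream(), StandardCharsets.UTF_8))) {
        String line;
        while ((line = reader.readLine()) != null) {
          lines.add(line);
        }
      }
    }
  }
  return lines;
}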
use of io.cdap.cdap.api.dataset.lib.FileSet in project cdap by caskdata.
In the class ReportGenerationAppTest, the method createAndInitializeDataset:
private DatasetId createAndInitializeDataset(NamespaceId namespaceId, long currentTimeMillis) throws Exception {
DatasetId metaFileset = namespaceId.dataset(ReportGenerationApp.RUN_META_FILESET);
addDatasetInstance(metaFileset, FileSet.class.getName());
// TODO: [CDAP-13216] temporarily create the run meta fileset and generate mock program run meta files here.
// Will remove once the TMS subscriber writing to the run meta fileset is implemented.
DataSetManager<FileSet> fileSet = getDataset(metaFileset);
populateMetaFiles(fileSet.get().getBaseLocation(), currentTimeMillis);
return metaFileset;
}
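populateMetaFiles is not shown here; conceptually it writes mock run-meta files under the FileSet's base location. The sketch below shows that general pattern only; the directory layout, file name, and contents are placeholders, not the ones the report generation app actually expects.
// Hedged sketch of writing a file under a FileSet's base location, the pattern
// populateMetaFiles(...) presumably follows; paths and contents are placeholders.
Location baseLocation = fileSet.get().getBaseLocation();
Location nsDir = baseLocation.append("ns1").append("mr");
nsDir.mkdirs();
try (OutputStream out = nsDir.append(String.valueOf(currentTimeMillis)).getOutputStream()) {
  out.write("mock run meta record".getBytes(StandardCharsets.UTF_8));
}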
use of io.cdap.cdap.api.dataset.lib.FileSet in project cdap by caskdata.
In the class SmartWorkflow, the method destroy:
@Override
public void destroy() {
WorkflowContext workflowContext = getContext();
PipelineRuntime pipelineRuntime = new PipelineRuntime(workflowContext, workflowMetrics);
// Execute the post actions only if the pipeline is not running in preview mode.
if (!workflowContext.getDataTracer(PostAction.PLUGIN_TYPE).isEnabled()) {
for (Map.Entry<String, PostAction> endingActionEntry : postActions.entrySet()) {
String name = endingActionEntry.getKey();
PostAction action = endingActionEntry.getValue();
StageSpec stageSpec = stageSpecs.get(name);
BatchActionContext context = new WorkflowBackedActionContext(workflowContext, pipelineRuntime, stageSpec);
try {
action.run(context);
} catch (Throwable t) {
LOG.error("Error while running post action {}.", name, t);
}
}
}
Map<String, String> connectorDatasets = GSON.fromJson(workflowContext.getWorkflowSpecification().getProperty(Constants.CONNECTOR_DATASETS), STAGE_DATASET_MAP);
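// maps stage names to the connector datasets written during the run; used below to read back
// the alerts buffered for each alert-publisher stage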
// publish all alerts
for (Map.Entry<String, AlertPublisher> alertPublisherEntry : alertPublishers.entrySet()) {
String stageName = alertPublisherEntry.getKey();
AlertPublisher alertPublisher = alertPublisherEntry.getValue();
FileSet alertConnector = workflowContext.getDataset(connectorDatasets.get(stageName));
try (CloseableIterator<Alert> alerts = new AlertReader(alertConnector)) {
if (!alerts.hasNext()) {
continue;
}
StageMetrics stageMetrics = new DefaultStageMetrics(workflowMetrics, stageName);
StageSpec stageSpec = stageSpecs.get(stageName);
AlertPublisherContext alertContext = new DefaultAlertPublisherContext(pipelineRuntime, stageSpec, workflowContext, workflowContext.getAdmin());
alertPublisher.initialize(alertContext);
TrackedIterator<Alert> trackedIterator = new TrackedIterator<>(alerts, stageMetrics, Constants.Metrics.RECORDS_IN);
alertPublisher.publish(trackedIterator);
} catch (Exception e) {
LOG.warn("Stage {} had errors publishing alerts. Alerts may not have been published.", stageName, e);
} finally {
try {
alertPublisher.destroy();
} catch (Exception e) {
LOG.warn("Error destroying alert publisher for stage {}", stageName, e);
}
}
}
ProgramStatus status = getContext().getState().getStatus();
if (status == ProgramStatus.FAILED) {
WRAPPERLOGGER.error("Pipeline '{}' failed.", getContext().getApplicationSpecification().getName());
} else {
WRAPPERLOGGER.info("Pipeline '{}' {}.", getContext().getApplicationSpecification().getName(), status == ProgramStatus.COMPLETED ? "succeeded" : status.name().toLowerCase());
}
MacroEvaluator macroEvaluator = new DefaultMacroEvaluator(pipelineRuntime.getArguments(), workflowContext.getLogicalStartTime(), workflowContext, workflowContext, workflowContext.getNamespace());
// Get resolved plugin properties
Map<String, Map<String, String>> resolvedProperties = new HashMap<>();
for (StageSpec spec : stageSpecs.values()) {
String stageName = spec.getName();
resolvedProperties.put(stageName, workflowContext.getPluginProperties(stageName, macroEvaluator).getProperties());
}
// Add resolved plugin properties to workflow token as a JSON String
workflowContext.getToken().put(RESOLVED_PLUGIN_PROPERTIES_MAP, GSON.toJson(resolvedProperties));
// record only if the Workflow is successful
if (status != ProgramStatus.COMPLETED) {
return;
}
// Collect field operations from each phase
WorkflowToken token = workflowContext.getToken();
List<NodeValue> allNodeValues = token.getAll(Constants.FIELD_OPERATION_KEY_IN_WORKFLOW_TOKEN);
if (allNodeValues.isEmpty()) {
// no field lineage recorded by any stage
return;
}
Map<String, List<FieldOperation>> allStageOperations = new HashMap<>();
for (StageSpec stageSpec : stageSpecs.values()) {
allStageOperations.put(stageSpec.getName(), new ArrayList<>());
}
for (NodeValue nodeValue : allNodeValues) {
Map<String, List<FieldOperation>> stageOperations = GSON.fromJson(nodeValue.getValue().toString(), STAGE_OPERATIONS_MAP);
for (Map.Entry<String, List<FieldOperation>> entry : stageOperations.entrySet()) {
// ignore operations reported for stages that are not part of the pipeline spec
if (allStageOperations.containsKey(entry.getKey())) {
allStageOperations.get(entry.getKey()).addAll(entry.getValue());
}
}
}
FieldLineageProcessor processor = new FieldLineageProcessor(spec);
Set<Operation> processedOperations = processor.validateAndConvert(allStageOperations);
if (!processedOperations.isEmpty()) {
workflowContext.record(processedOperations);
}
}
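The post actions executed at the top of destroy() are pipeline plugins of type PostAction.PLUGIN_TYPE. A minimal sketch of such a plugin is shown below; the class name, logging, and the success check are illustrative, and the exact plugin API should be verified against the cdap-etl version in use.
// Hedged sketch of a post-action plugin of the kind whose run() is called above.
// Names are illustrative; verify the API against your cdap-etl version.
@Plugin(type = PostAction.PLUGIN_TYPE)
@Name("LoggingPostAction")
public class LoggingPostAction extends PostAction {
  private static final Logger LOG = LoggerFactory.getLogger(LoggingPostAction.class);

  @Override
  public void run(BatchActionContext context) throws Exception {
    // BatchActionContext exposes the workflow outcome, so a post action can react to success or failure.
    if (context.isSuccessful()) {
      LOG.info("Pipeline run succeeded; executing post action.");
    }
  }
}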