use of co.cask.cdap.api.dataset.lib.PartitionedFileSet in project cdap by caskdata.
the class DynamicPartitioningTestRun method testDynamicPartitioningMRWithFailure.
/**
 * Runs the "DynamicPartitioningMR" MapReduce against freshly re-created output datasets, one of which
 * already holds partition (x="1"), waits until the new run is no longer STARTING or RUNNING, and then
 * validates the partitions and files of every output dataset.
 *
 * @param appManager manager of the application that contains the MapReduce program
 * @param dsWithExistingPartition name of the output dataset in which partition (x="1") is pre-created
 * @param outputs names of all output datasets; each one is deleted (if it exists) and created anew
 * @throws Exception if dataset setup, waiting for the run, or validation fails
 */
private void testDynamicPartitioningMRWithFailure(ApplicationManager appManager, String dsWithExistingPartition, String... outputs) throws Exception {
  // set up the output datasets; their names are handed to the MapReduce as one space-separated argument
  StringBuilder outputNames = new StringBuilder();
  for (String dataset : outputs) {
    outputNames.append(dataset).append(' ');
    try {
      deleteDatasetInstance(testSpace.dataset(dataset));
    } catch (InstanceNotFoundException e) {
      // may be expected. I wish the test framework had truncate()
    }
    addDatasetInstance(PartitionedFileSet.class.getName(), testSpace.dataset(dataset),
                       PartitionedFileSetProperties.builder()
                         .setPartitioning(PARTITIONING)
                         .setEnableExploreOnCreate(true)
                         .setOutputFormat(org.apache.hadoop.mapreduce.lib.output.TextOutputFormat.class)
                         .setOutputProperty(org.apache.hadoop.mapreduce.lib.output.TextOutputFormat.SEPERATOR, ",")
                         .setExploreFormat("csv")
                         .setExploreSchema("key string, value string")
                         .build());
  }
  // trim() drops the trailing separator appended after the last name
  String outputArg = outputNames.toString().trim();

  // create partition (x="1") in one of the outputs
  DataSetManager<PartitionedFileSet> pfs = getDataset(testSpace.dataset(dsWithExistingPartition));
  Location loc = pfs.get().getEmbeddedFileSet().getLocation("some/path");
  // both the stream and the writer are managed resources; the charset is explicit so the bytes written
  // do not depend on the platform default encoding
  try (OutputStream os = loc.append("part1").getOutputStream();
       Writer writer = new OutputStreamWriter(os, java.nio.charset.StandardCharsets.UTF_8)) {
    writer.write("1,x\n");
  }
  pfs.get().addPartition(PartitionKey.builder().addStringField("x", "1").build(), "some/path");
  pfs.flush();
  validatePartitions(dsWithExistingPartition, true);

  Map<String, String> arguments = ImmutableMap.of("outputs", outputArg);
  final MapReduceManager mrManager = appManager.getMapReduceManager("DynamicPartitioningMR");
  // remember the runs that exist before starting, so the new run can be told apart from them
  final Set<RunRecord> oldRunRecords = new HashSet<>(mrManager.getHistory());
  mrManager.start(arguments);

  // Wait for the new run record to appear and to leave the STARTING/RUNNING states.
  Tasks.waitFor(true, new Callable<Boolean>() {
    @Override
    public Boolean call() throws Exception {
      Set<RunRecord> newRunRecords = Sets.difference(new HashSet<>(mrManager.getHistory()), oldRunRecords);
      // take the first record that was not present before the start; null means none has appeared yet
      RunRecord runRecord = Iterables.getFirst(newRunRecords, null);
      if (runRecord == null) {
        return false;
      }
      return runRecord.getStatus() != ProgramRunStatus.STARTING
        && runRecord.getStatus() != ProgramRunStatus.RUNNING;
    }
  }, 5, TimeUnit.MINUTES, 1, TimeUnit.SECONDS);

  // only the dataset with the pre-created partition is validated against that partition's location
  for (String dataset : outputs) {
    boolean hasExistingPartition = dataset.equals(dsWithExistingPartition);
    validatePartitions(dataset, hasExistingPartition);
    validateFiles(dataset, hasExistingPartition ? loc : null);
  }
}
use of co.cask.cdap.api.dataset.lib.PartitionedFileSet in project cdap by caskdata.
the class PartitionConsumingTestRun method testWordCountOnFileSet.
/**
 * Deploys {@code AppWithPartitionConsumers}, feeds partitions to it through the "DatasetService" service,
 * runs the supplied program three times and checks the word counts and the "outputLines" dataset after
 * the runs.
 *
 * @param runProgram starts the program under test for the given application and returns its manager
 * @param produceOutputPartitionEachRun whether three (true) or two (false) output partitions are expected
 *                                      after the three runs
 * @throws Exception if deployment, any program run, or any verification step fails
 */
private void testWordCountOnFileSet(Function<ApplicationManager, ProgramManager> runProgram, boolean produceOutputPartitionEachRun) throws Exception {
  ApplicationManager applicationManager = deployApplication(AppWithPartitionConsumers.class);
  ServiceManager serviceManager = applicationManager.getServiceManager("DatasetService").start();
  serviceManager.waitForStatus(true);
  URL serviceURL = serviceManager.getServiceURL();

  // write a file to the file set using the service and run the WordCount MapReduce job on that one partition
  createPartition(serviceURL, LINE1, "1");
  ProgramManager programManager = runProgram.apply(applicationManager);
  programManager.waitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
  assertWordCounts(serviceURL, 2L, 1L, 0L);

  // create two additional partitions
  createPartition(serviceURL, LINE2, "2");
  createPartition(serviceURL, LINE3, "3");

  // running the program job now processes these two new partitions (LINE2 and LINE3) and updates the counts
  // dataset accordingly
  programManager = runProgram.apply(applicationManager);
  programManager.waitForRuns(ProgramRunStatus.COMPLETED, 2, 5, TimeUnit.MINUTES);
  assertWordCounts(serviceURL, 3L, 3L, 3L);

  // running the program without adding new partitions does not affect the counts dataset
  programManager = runProgram.apply(applicationManager);
  programManager.waitForRuns(ProgramRunStatus.COMPLETED, 3, 5, TimeUnit.MINUTES);
  assertWordCounts(serviceURL, 3L, 3L, 3L);

  DataSetManager<PartitionedFileSet> outputLines = getDataset("outputLines");
  Set<PartitionDetail> partitions = outputLines.get().getPartitions(PartitionFilter.ALWAYS_MATCH);
  // each of the three MapReduce runs produces an output partition (even if there's no input data)
  // however, Worker run doesn't produce a new output partition if there's no new input partition
  Assert.assertEquals(produceOutputPartitionEachRun ? 3 : 2, partitions.size());

  // we only store the counts to the "outputLines" dataset
  List<String> expectedCounts = Lists.newArrayList("1", "1", "2", "2", "3");
  List<String> outputRecords = getDataFromExplore("outputLines");
  // sort so the comparison does not depend on the order in which records are returned
  Collections.sort(outputRecords);
  Assert.assertEquals(expectedCounts, outputRecords);
}

/**
 * Asserts the counts reported by the service for the words "a", "b" and "c".
 */
private void assertWordCounts(URL serviceURL, long expectedA, long expectedB, long expectedC) throws Exception {
  // Long.valueOf replaces the deprecated Long(long) constructor; assertEquals compares by equals()
  Assert.assertEquals(Long.valueOf(expectedA), getCount(serviceURL, "a"));
  Assert.assertEquals(Long.valueOf(expectedB), getCount(serviceURL, "b"));
  Assert.assertEquals(Long.valueOf(expectedC), getCount(serviceURL, "c"));
}
use of co.cask.cdap.api.dataset.lib.PartitionedFileSet in project cdap by caskdata.
the class PartitionBatchInput method setInput.
/**
 * Configures a {@link PartitionedFileSet} as input of a batch job, restricted to the {@link Partition}s that a
 * partition consumer hands out for this run. Intended to be called from the initialize method of the
 * implementing batch job.
 *
 * <p>The partitions are obtained from a {@link ConcurrentPartitionConsumer} whose state is managed through the
 * given persistor; they are then set as the input partitions of the dataset on the MapReduce context.
 *
 * @param mapreduceContext MapReduce context used to access the PartitionedFileSet, and on which the input is
 *                         configured
 * @param partitionedFileSetName the name of the {@link PartitionedFileSet} to consume partitions from
 * @param statePersistor a {@link DatasetStatePersistor} responsible for defining how the partition consumer
 *                       state is managed
 * @param consumerConfiguration defines parameters for the partition consumption
 * @return a BatchPartitionCommitter that, when invoked, reports the consumed partitions and the success flag
 *         back to the partition consumer
 */
public static BatchPartitionCommitter setInput(MapReduceContext mapreduceContext, String partitionedFileSetName, DatasetStatePersistor statePersistor, ConsumerConfiguration consumerConfiguration) {
  PartitionedFileSet pfs = mapreduceContext.getDataset(partitionedFileSetName);

  // the consumer reads and writes its state through a persistor bound to this MapReduce context
  DelegatingStatePersistor delegatingPersistor = new DelegatingStatePersistor(mapreduceContext, statePersistor);
  PartitionConsumer consumer = new ConcurrentPartitionConsumer(pfs, delegatingPersistor, consumerConfiguration);
  List<PartitionDetail> partitionsForThisRun = consumer.consumePartitions().getPartitions();

  // restrict the dataset input to exactly the partitions handed out above
  Map<String, String> inputArguments = new HashMap<>();
  PartitionedFileSetArguments.addInputPartitions(inputArguments, partitionsForThisRun);
  mapreduceContext.addInput(Input.ofDataset(partitionedFileSetName, inputArguments));

  return succeeded -> consumer.onFinish(partitionsForThisRun, succeeded);
}
use of co.cask.cdap.api.dataset.lib.PartitionedFileSet in project cdap by caskdata.
the class MapReduceWithPartitionedTest method testPartitionedFileSetWithMR.
/**
 * Exercises writing to and reading from a partitioned file set through MapReduce programs of
 * {@code AppWithPartitionedFileSet}: two PartitionWriter runs each create one output partition, and three
 * PartitionReader runs read the partitions selected by different input partition filters.
 *
 * @param useCombineFileInputFormat passed to the application configuration at deployment
 * @throws Exception if deployment, a program run, or a transactional check fails
 */
private void testPartitionedFileSetWithMR(boolean useCombineFileInputFormat) throws Exception {
  ApplicationWithPrograms app = deployApp(AppWithPartitionedFileSet.class, new AppWithPartitionedFileSet.AppConfig(useCombineFileInputFormat));
  // write a value to the input table
  final Table table = datasetCache.getDataset(AppWithPartitionedFileSet.INPUT);
  Transactions.createTransactionExecutor(txExecutorFactory, (TransactionAware) table).execute(new TransactionExecutor.Subroutine() {
    @Override
    public void apply() {
      table.put(Bytes.toBytes("x"), AppWithPartitionedFileSet.ONLY_COLUMN, Bytes.toBytes("1"));
    }
  });
  // a partition key for the map/reduce output
  final PartitionKey keyX = PartitionKey.builder().addStringField("type", "x").addLongField("time", 150000L).build();
  // run the partition writer m/r with this output partition time
  Map<String, String> runtimeArguments = Maps.newHashMap();
  Map<String, String> outputArgs = Maps.newHashMap();
  PartitionedFileSetArguments.setOutputPartitionKey(outputArgs, keyX);
  runtimeArguments.putAll(RuntimeArguments.addScope(Scope.DATASET, PARTITIONED, outputArgs));
  Assert.assertTrue(runProgram(app, AppWithPartitionedFileSet.PartitionWriter.class, new BasicArguments(runtimeArguments)));
  // this should have created a partition in the tpfs
  final PartitionedFileSet dataset = datasetCache.getDataset(PARTITIONED);
  assertPartitionWithPath(dataset, keyX, "x", "150000");
  // delete the data in the input table and write a new row
  Transactions.createTransactionExecutor(txExecutorFactory, (TransactionAware) table).execute(new TransactionExecutor.Subroutine() {
    @Override
    public void apply() {
      table.delete(Bytes.toBytes("x"));
      table.put(Bytes.toBytes("y"), AppWithPartitionedFileSet.ONLY_COLUMN, Bytes.toBytes("2"));
    }
  });
  // a new partition key for the next map/reduce
  final PartitionKey keyY = PartitionKey.builder().addStringField("type", "y").addLongField("time", 200000L).build();
  // now run the m/r again with the new output partition key (a later partition time)
  PartitionedFileSetArguments.setOutputPartitionKey(outputArgs, keyY);
  runtimeArguments.putAll(RuntimeArguments.addScope(Scope.DATASET, PARTITIONED, outputArgs));
  Assert.assertTrue(runProgram(app, AppWithPartitionedFileSet.PartitionWriter.class, new BasicArguments(runtimeArguments)));
  // this should have created a partition in the tpfs
  assertPartitionWithPath(dataset, keyY, "y", "200000");
  // a partition filter that matches the outputs of both map/reduces
  PartitionFilter filterXY = PartitionFilter.builder().addRangeCondition("type", "x", "z").build();
  // now run a map/reduce that reads all the partitions
  runtimeArguments = Maps.newHashMap();
  Map<String, String> inputArgs = Maps.newHashMap();
  PartitionedFileSetArguments.setInputPartitionFilter(inputArgs, filterXY);
  runtimeArguments.putAll(RuntimeArguments.addScope(Scope.DATASET, PARTITIONED, inputArgs));
  runtimeArguments.put(AppWithPartitionedFileSet.ROW_TO_WRITE, "a");
  Assert.assertTrue(runProgram(app, AppWithPartitionedFileSet.PartitionReader.class, new BasicArguments(runtimeArguments)));
  // this should have read both partitions - and written both x and y to row a
  final Table output = datasetCache.getDataset(AppWithPartitionedFileSet.OUTPUT);
  Transactions.createTransactionExecutor(txExecutorFactory, (TransactionAware) output).execute(new TransactionExecutor.Subroutine() {
    @Override
    public void apply() {
      Row row = output.get(Bytes.toBytes("a"));
      Assert.assertEquals("1", row.getString("x"));
      Assert.assertEquals("{type=x, time=150000}", row.getString("x_key"));
      Assert.assertEquals("2", row.getString("y"));
      Assert.assertEquals("{type=y, time=200000}", row.getString("y_key"));
    }
  });
  // a partition filter that matches the output key of the first map/reduce
  PartitionFilter filterX = PartitionFilter.builder().addValueCondition("type", "x").addRangeCondition("time", null, 160000L).build();
  // now run a map/reduce that reads a range of the partitions, namely the first one
  // NOTE(review): runtimeArguments still holds the entries added for the previous filter; this presumably
  // relies on the new filter's entries taking precedence over them - confirm against PartitionedFileSetArguments
  inputArgs.clear();
  PartitionedFileSetArguments.setInputPartitionFilter(inputArgs, filterX);
  runtimeArguments.putAll(RuntimeArguments.addScope(Scope.DATASET, PARTITIONED, inputArgs));
  runtimeArguments.put(AppWithPartitionedFileSet.ROW_TO_WRITE, "b");
  Assert.assertTrue(runProgram(app, AppWithPartitionedFileSet.PartitionReader.class, new BasicArguments(runtimeArguments)));
  // this should have read the first partition only - and written only x to row b
  Transactions.createTransactionExecutor(txExecutorFactory, (TransactionAware) output).execute(new TransactionExecutor.Subroutine() {
    @Override
    public void apply() {
      Row row = output.get(Bytes.toBytes("b"));
      Assert.assertEquals("1", row.getString("x"));
      Assert.assertEquals("{type=x, time=150000}", row.getString("x_key"));
      Assert.assertNull(row.get("y"));
      Assert.assertNull(row.get("y_key"));
    }
  });
  // a partition filter that matches no key
  PartitionFilter filterMT = PartitionFilter.builder().addValueCondition("type", "nosuchthing").build();
  // now run a map/reduce that reads an empty range of partitions (the filter matches nothing)
  inputArgs.clear();
  PartitionedFileSetArguments.setInputPartitionFilter(inputArgs, filterMT);
  runtimeArguments.putAll(RuntimeArguments.addScope(Scope.DATASET, PARTITIONED, inputArgs));
  runtimeArguments.put(AppWithPartitionedFileSet.ROW_TO_WRITE, "n");
  Assert.assertTrue(runProgram(app, AppWithPartitionedFileSet.PartitionReader.class, new BasicArguments(runtimeArguments)));
  // this should have read no partitions - and written nothing to row n
  Transactions.createTransactionExecutor(txExecutorFactory, (TransactionAware) output).execute(new TransactionExecutor.Subroutine() {
    @Override
    public void apply() {
      Row row = output.get(Bytes.toBytes("n"));
      Assert.assertTrue(row.isEmpty());
    }
  });
}

/**
 * Asserts, inside a transaction, that the dataset has a partition for the given key and that the
 * partition's relative path is non-null and contains every one of the given fragments.
 */
private void assertPartitionWithPath(final PartitionedFileSet dataset, final PartitionKey key, final String... expectedPathFragments) throws Exception {
  Transactions.createTransactionExecutor(txExecutorFactory, (TransactionAware) dataset).execute(new TransactionExecutor.Subroutine() {
    @Override
    public void apply() {
      Partition partition = dataset.getPartition(key);
      Assert.assertNotNull(partition);
      String path = partition.getRelativePath();
      // check for null first so a missing path fails as an assertion rather than a NullPointerException
      Assert.assertNotNull(path);
      for (String fragment : expectedPathFragments) {
        Assert.assertTrue("path '" + path + "' should contain '" + fragment + "'", path.contains(fragment));
      }
    }
  });
}
use of co.cask.cdap.api.dataset.lib.PartitionedFileSet in project cdap by caskdata.
the class SmartWorkflow method destroy.
/**
 * Runs at the end of the workflow: executes the post actions (unless the data tracer for post actions is
 * enabled), publishes the alerts found in each alert publisher's dataset, logs the final pipeline status,
 * and stores the resolved plugin properties of every stage in the workflow token as JSON.
 *
 * Failures of individual post actions and alert publishers are logged and do not stop the remaining work.
 */
@Override
public void destroy() {
  WorkflowContext workflowContext = getContext();
  PipelineRuntime pipelineRuntime = new PipelineRuntime(workflowContext, workflowMetrics);
  // Execute the post actions only if pipeline is not running in preview mode.
  if (!workflowContext.getDataTracer(PostAction.PLUGIN_TYPE).isEnabled()) {
    for (Map.Entry<String, PostAction> endingActionEntry : postActions.entrySet()) {
      String name = endingActionEntry.getKey();
      PostAction action = endingActionEntry.getValue();
      StageSpec stageSpec = stageSpecs.get(name);
      BatchActionContext context = new WorkflowBackedActionContext(workflowContext, pipelineRuntime, stageSpec);
      try {
        action.run(context);
      } catch (Throwable t) {
        // one failing post action must not prevent the others from running
        LOG.error("Error while running post action {}.", name, t);
      }
    }
  }
  // publish all alerts
  for (Map.Entry<String, AlertPublisher> alertPublisherEntry : alertPublishers.entrySet()) {
    String name = alertPublisherEntry.getKey();
    AlertPublisher alertPublisher = alertPublisherEntry.getValue();
    // the alerts of a publisher stage are read from the dataset that carries the stage's name
    PartitionedFileSet alertConnector = workflowContext.getDataset(name);
    try (CloseableIterator<Alert> alerts = new AlertReader(alertConnector.getPartitions(PartitionFilter.ALWAYS_MATCH))) {
      if (!alerts.hasNext()) {
        // nothing to publish for this stage
        // NOTE(review): the finally block below still calls destroy() on this publisher even though
        // initialize() was never called - confirm this matches the AlertPublisher lifecycle contract
        continue;
      }
      StageMetrics stageMetrics = new DefaultStageMetrics(workflowMetrics, name);
      StageSpec stageSpec = stageSpecs.get(name);
      AlertPublisherContext alertContext = new DefaultAlertPublisherContext(pipelineRuntime, stageSpec, workflowContext, workflowContext.getAdmin());
      alertPublisher.initialize(alertContext);
      TrackedIterator<Alert> trackedIterator = new TrackedIterator<>(alerts, stageMetrics, Constants.Metrics.RECORDS_IN);
      alertPublisher.publish(trackedIterator);
    } catch (Exception e) {
      LOG.warn("Stage {} had errors publishing alerts. Alerts may not have been published.", name, e);
    } finally {
      try {
        alertPublisher.destroy();
      } catch (Exception e) {
        LOG.warn("Error destroying alert publisher for stage {}", name, e);
      }
    }
  }
  ProgramStatus status = workflowContext.getState().getStatus();
  String pipelineName = workflowContext.getApplicationSpecification().getName();
  if (status == ProgramStatus.FAILED) {
    WRAPPERLOGGER.error("Pipeline '{}' failed.", pipelineName);
  } else {
    // Locale.ROOT keeps the lower-cased status name independent of the JVM's default locale
    String outcome = status == ProgramStatus.COMPLETED ? "succeeded" : status.name().toLowerCase(java.util.Locale.ROOT);
    WRAPPERLOGGER.info("Pipeline '{}' {}.", pipelineName, outcome);
  }
  MacroEvaluator macroEvaluator = new DefaultMacroEvaluator(pipelineRuntime.getArguments(), workflowContext.getLogicalStartTime(), workflowContext, workflowContext.getNamespace());
  // Get resolved plugin properties
  Map<String, Map<String, String>> resolvedProperties = new HashMap<>();
  for (StageSpec spec : stageSpecs.values()) {
    String stageName = spec.getName();
    resolvedProperties.put(stageName, workflowContext.getPluginProperties(stageName, macroEvaluator).getProperties());
  }
  // Add resolved plugin properties to workflow token as a JSON String
  workflowContext.getToken().put(RESOLVED_PLUGIN_PROPERTIES_MAP, GSON.toJson(resolvedProperties));
}
Aggregations