Use of io.cdap.cdap.api.dataset.lib.TimePartitionedFileSet in project cdap by caskdata.
From the class TimePartitionedFileSetTest, method testPartitionMetadata:
@Test
public void testPartitionMetadata() throws Exception {
  final TimePartitionedFileSet tpfs = dsFrameworkUtil.getInstance(TPFS_INSTANCE);
  TransactionAware txAware = (TransactionAware) tpfs;
  dsFrameworkUtil.newInMemoryTransactionExecutor(txAware).execute(new TransactionExecutor.Subroutine() {
    @Override
    public void apply() throws Exception {
      // make sure the dataset has no partitions
      validateTimePartitions(tpfs, 0L, MAX, Collections.<Long, String>emptyMap());
      Date date = DATE_FORMAT.parse("6/4/12 10:00 am");
      long time = date.getTime();
      // keep track of all the metadata added
      Map<String, String> allMetadata = Maps.newHashMap();
      Map<String, String> metadata = ImmutableMap.of("key1", "value1", "key2", "value3", "key100", "value4");
      tpfs.addPartition(time, "file", metadata);
      allMetadata.putAll(metadata);
      TimePartitionDetail partitionByTime = tpfs.getPartitionByTime(time);
      Assert.assertNotNull(partitionByTime);
      Assert.assertEquals(metadata, partitionByTime.getMetadata().asMap());
      tpfs.addMetadata(time, "key3", "value4");
      allMetadata.put("key3", "value4");
      // using the setMetadata API, adding an entry for a key that already exists overwrites the previous value
      tpfs.setMetadata(time, Collections.singletonMap("key3", "value5"));
      allMetadata.put("key3", "value5");
      Map<String, String> newMetadata = ImmutableMap.of("key4", "value4", "key5", "value5");
      tpfs.addMetadata(time, newMetadata);
      allMetadata.putAll(newMetadata);
      try {
        // attempting to update an existing key via addMetadata throws a DataSetException
        tpfs.addMetadata(time, "key3", "value5");
        Assert.fail("Expected not to be able to update an existing metadata entry");
      } catch (DataSetException expected) {
      }
      partitionByTime = tpfs.getPartitionByTime(time);
      Assert.assertNotNull(partitionByTime);
      Assert.assertEquals(allMetadata, partitionByTime.getMetadata().asMap());
      // remove metadata entries; specifying a metadata key that does not exist ('key6') does not cause an error
      tpfs.removeMetadata(time, ImmutableSet.of("key4", "key5", "key6"));
      allMetadata.remove("key4");
      allMetadata.remove("key5");
      partitionByTime = tpfs.getPartitionByTime(time);
      Assert.assertNotNull(partitionByTime);
      Assert.assertEquals(allMetadata, partitionByTime.getMetadata().asMap());
    }
  });
}
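The test above exercises the full metadata lifecycle of a single partition: create with initial metadata, add new keys with addMetadata (which fails for existing keys), overwrite with setMetadata, and delete with removeMetadata. The following is a minimal condensed sketch of that same sequence against an arbitrary TimePartitionedFileSet; the partition time, the relative path "data/file", and the key/value names are illustrative assumptions, and in practice the calls would run inside a transaction, as the test does via TransactionExecutor.

import java.util.Collections;
import java.util.Map;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import io.cdap.cdap.api.dataset.lib.TimePartitionedFileSet;

public final class PartitionMetadataSketch {

  private PartitionMetadataSketch() { }

  // Walks one partition through the metadata lifecycle shown in the test above.
  static void updateMetadata(TimePartitionedFileSet tpfs, long partitionTime) {
    // create the partition with initial metadata
    tpfs.addPartition(partitionTime, "data/file", ImmutableMap.of("source", "sketch"));
    // add a new key; addMetadata fails if the key already exists
    tpfs.addMetadata(partitionTime, "owner", "ops");
    // setMetadata overwrites an existing key instead of failing
    tpfs.setMetadata(partitionTime, Collections.singletonMap("owner", "analytics"));
    // removing a key that is absent is not an error, as the test above demonstrates
    tpfs.removeMetadata(partitionTime, ImmutableSet.of("owner", "no-such-key"));
    // read the surviving metadata back from the partition detail
    Map<String, String> remaining = tpfs.getPartitionByTime(partitionTime).getMetadata().asMap();
    System.out.println(remaining);
  }
}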
Use of io.cdap.cdap.api.dataset.lib.TimePartitionedFileSet in project cdap by caskdata.
From the class TimePartitionedFileSetTest, method validateInputPaths:
/**
 * Validates that the input configuration of the tpfs, when instantiated with (time + start * minutes) as
 * the input start time and (time + end * minutes) as the input end time, returns the expected list of paths.
 */
private void validateInputPaths(long time, long start, long end, final String... expected)
  throws IOException, DatasetManagementException, InterruptedException, TransactionFailureException,
  UnauthorizedException {
  Map<String, String> arguments = Maps.newHashMap();
  TimePartitionedFileSetArguments.setInputStartTime(arguments, time + start * MINUTE);
  TimePartitionedFileSetArguments.setInputEndTime(arguments, time + end * MINUTE);
  final TimePartitionedFileSet tpfs = dsFrameworkUtil.getInstance(TPFS_INSTANCE, arguments);
  TransactionAware txAwareDataset = (TransactionAware) tpfs;
  dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {
    @Override
    public void apply() throws Exception {
      Map<String, String> inputConfig = tpfs.getInputFormatConfiguration();
      String inputs = inputConfig.get(FileInputFormat.INPUT_DIR);
      Assert.assertNotNull(inputs);
      if (expected.length == 0) {
        Assert.assertTrue(inputs.isEmpty());
        return;
      }
      String[] inputPaths = inputs.split(",");
      Assert.assertEquals(expected.length, inputPaths.length);
      // order is not guaranteed
      Arrays.sort(expected);
      Arrays.sort(inputPaths);
      for (int i = 0; i < expected.length; i++) {
        // every input path is absolute, whereas the expected paths are relative
        Assert.assertTrue("path #" + i + " does not match", inputPaths[i].endsWith(expected[i]));
      }
    }
  });
}
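The core pattern in this helper, setting the input start and end time as runtime arguments when the dataset is instantiated and then reading FileInputFormat.INPUT_DIR from getInputFormatConfiguration(), can be reused outside the test framework. Below is a minimal sketch of that pattern; it assumes the TimePartitionedFileSet instance was already obtained with the returned arguments (as the helper does via dsFrameworkUtil.getInstance), and the method and class names are illustrative.

import java.util.HashMap;
import java.util.Map;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import io.cdap.cdap.api.dataset.lib.TimePartitionedFileSet;
import io.cdap.cdap.api.dataset.lib.TimePartitionedFileSetArguments;

public final class InputWindowSketch {

  private InputWindowSketch() { }

  // Builds the runtime arguments that select partitions within the given time window.
  static Map<String, String> inputWindow(long startMillis, long endMillis) {
    Map<String, String> arguments = new HashMap<>();
    TimePartitionedFileSetArguments.setInputStartTime(arguments, startMillis);
    TimePartitionedFileSetArguments.setInputEndTime(arguments, endMillis);
    return arguments;
  }

  // Prints the input directories a dataset instantiated with the arguments above
  // would hand to FileInputFormat: one absolute path per selected partition,
  // or an empty string when no partition falls into the window.
  static void printSelectedPaths(TimePartitionedFileSet tpfs) {
    String inputDirs = tpfs.getInputFormatConfiguration().get(FileInputFormat.INPUT_DIR);
    for (String path : inputDirs.split(",")) {
      System.out.println(path);
    }
  }
}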
Use of io.cdap.cdap.api.dataset.lib.TimePartitionedFileSet in project cdap by caskdata.
From the class TimePartitionedFileSetTest, method testTimePartitionedInputArguments:
@Test
public void testTimePartitionedInputArguments() throws Exception {
  final long time8 = DATE_FORMAT.parse("10/17/2014 8:42 am").getTime();
  final long time9 = DATE_FORMAT.parse("10/17/2014 9:42 am").getTime();
  final String path8 = "8:42";
  final String path9 = "9:42";
  final PartitionFilter filter9 = PartitionFilter.builder().addRangeCondition("hour", 9, null).build();
  // add a few partitions
  {
    final TimePartitionedFileSet dataset = dsFrameworkUtil.getInstance(TPFS_INSTANCE);
    final TransactionAware txAwareDataset = (TransactionAware) dataset;
    dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {
      @Override
      public void apply() throws Exception {
        dataset.addPartition(time8, path8);
        dataset.addPartition(time9, path9);
      }
    });
  }
  // test specifying a time range for input
  Map<String, String> arguments = Maps.newHashMap();
  TimePartitionedFileSetArguments.setInputStartTime(arguments, time8 - 30 * MINUTE);
  TimePartitionedFileSetArguments.setInputEndTime(arguments, time8 + 30 * MINUTE);
  testInputConfiguration(arguments, path8);
  // add a partition filter; it should have no effect as long as a time range is specified
  TimePartitionedFileSetArguments.setInputPartitionFilter(arguments, filter9);
  testInputConfiguration(arguments, path8);
  // test specifying input with only a partition filter
  arguments.clear();
  TimePartitionedFileSetArguments.setInputPartitionFilter(arguments, filter9);
  testInputConfiguration(arguments, path9);
  // test specifying only a start time or only an end time for input (both should fail)
  arguments.clear();
  TimePartitionedFileSetArguments.setInputStartTime(arguments, time8 + 30 * MINUTE);
  testInputConfigurationFailure(arguments, " with only a start time");
  arguments.clear();
  TimePartitionedFileSetArguments.setInputEndTime(arguments, time8 + 30 * MINUTE);
  testInputConfigurationFailure(arguments, " with only an end time");
}
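As the test shows, a PartitionFilter only takes effect when no input time range is set, so reading by filter boils down to building the filter and putting it into the dataset arguments. A minimal sketch follows, reusing the "hour" field from the filter in the test above; the 9-to-17 range and the class/method names are arbitrary illustrations.

import java.util.HashMap;
import java.util.Map;
import io.cdap.cdap.api.dataset.lib.PartitionFilter;
import io.cdap.cdap.api.dataset.lib.TimePartitionedFileSetArguments;

public final class FilterArgsSketch {

  private FilterArgsSketch() { }

  // Builds input arguments selecting partitions whose "hour" field falls in the 9-to-17 range,
  // analogous to the open-ended "hour >= 9" filter used in the test above.
  static Map<String, String> businessHoursArguments() {
    PartitionFilter filter = PartitionFilter.builder()
      .addRangeCondition("hour", 9, 17)
      .build();
    Map<String, String> arguments = new HashMap<>();
    // note: if an input start/end time is also set, the filter is ignored, as the test demonstrates
    TimePartitionedFileSetArguments.setInputPartitionFilter(arguments, filter);
    return arguments;
  }
}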
Use of io.cdap.cdap.api.dataset.lib.TimePartitionedFileSet in project cdap by caskdata.
From the class PartitionCorrectorTestRun, method createPartition:
private void createPartition(DataSetManager<TimePartitionedFileSet> tpfsManager, long time, int i) throws Exception {
  TimePartitionedFileSet tpfs = tpfsManager.get();
  TimePartitionOutput output = tpfs.getPartitionOutput(time);
  try (PrintStream out = new PrintStream(output.getLocation().append("file").getOutputStream())) {
    out.println(String.format("%d,x%d", i, i));
  }
  output.addPartition();
  tpfsManager.flush();
}
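A helper like this is typically called in a loop to seed a series of partitions before running the program under test. The sketch below shows such a caller; it would sit in the same test class (reusing its imports) and the five-partition count and one-hour spacing are assumptions for illustration, not values taken from PartitionCorrectorTestRun.

// Seeds a handful of hourly partitions by repeatedly calling createPartition(...) above.
private void createHourlyPartitions(DataSetManager<TimePartitionedFileSet> tpfsManager, long baseTime) throws Exception {
  for (int i = 0; i < 5; i++) {
    createPartition(tpfsManager, baseTime + TimeUnit.HOURS.toMillis(i), i);
  }
}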
Use of io.cdap.cdap.api.dataset.lib.TimePartitionedFileSet in project cdap by caskdata.
From the class SparkFileSetTestRun, method testSparkWithTimePartitionedFileSet:
private void testSparkWithTimePartitionedFileSet(ApplicationManager applicationManager, String sparkProgram) throws Exception {
  long customOutputPartitionKey = 123456789L;
  long customInputPartitionKey = 987654321L;
  DataSetManager<TimePartitionedFileSet> tpfsManager = getDataset("tpfs");
  long inputTime = System.currentTimeMillis();
  long outputTime = inputTime + TimeUnit.HOURS.toMillis(1);
  addTimePartition(tpfsManager, inputTime);
  addTimePartition(tpfsManager, customInputPartitionKey);
  Map<String, String> inputArgs = new HashMap<>();
  TimePartitionedFileSetArguments.setInputStartTime(inputArgs, inputTime - 100);
  TimePartitionedFileSetArguments.setInputEndTime(inputArgs, inputTime + 100);
  Map<String, String> outputArgs = new HashMap<>();
  TimePartitionedFileSetArguments.setOutputPartitionTime(outputArgs, outputTime);
  Map<String, String> args = new HashMap<>();
  args.putAll(RuntimeArguments.addScope(Scope.DATASET, "tpfs", inputArgs));
  args.putAll(RuntimeArguments.addScope(Scope.DATASET, "tpfs", outputArgs));
  args.put("input", "tpfs");
  args.put("output", "tpfs");
  args.put("outputKey", String.valueOf(customOutputPartitionKey));
  args.put("inputKey", String.valueOf(customInputPartitionKey));
  SparkManager sparkManager = applicationManager.getSparkManager(sparkProgram).start(args);
  sparkManager.waitForRun(ProgramRunStatus.COMPLETED, 10, TimeUnit.MINUTES);
  tpfsManager.flush();
  TimePartitionedFileSet tpfs = tpfsManager.get();
  PartitionDetail partition = tpfs.getPartitionByTime(outputTime);
  Assert.assertNotNull("Output partition is null when running without custom dataset arguments", partition);
  validateFileOutput(partition.getLocation());
  PartitionDetail customPartition = tpfs.getPartitionByTime(customOutputPartitionKey);
  Assert.assertNotNull("Output partition is null when running with custom dataset arguments", customPartition);
  validateFileOutput(customPartition.getLocation());
  // clean up after running the test
  tpfs.dropPartition(inputTime);
  tpfs.dropPartition(customInputPartitionKey);
  tpfs.dropPartition(partition.getPartitionKey());
  tpfs.dropPartition(customPartition.getPartitionKey());
  tpfsManager.flush();
}
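The addTimePartition helper invoked at the top of this method is not included in the excerpt. A plausible sketch is given below, modeled on the createPartition helper from PartitionCorrectorTestRun shown earlier; it would live in the same test class (reusing its imports), and the file name "input" and the line written into it are assumptions, not the actual helper from SparkFileSetTestRun.

// Hypothetical sketch of the helper referenced above: writes one file into a new time partition.
private void addTimePartition(DataSetManager<TimePartitionedFileSet> tpfsManager, long time) throws Exception {
  TimePartitionedFileSet tpfs = tpfsManager.get();
  TimePartitionOutput output = tpfs.getPartitionOutput(time);
  try (PrintStream out = new PrintStream(output.getLocation().append("input").getOutputStream())) {
    out.println("line for partition " + time); // placeholder content for the Spark program to read
  }
  output.addPartition();
  tpfsManager.flush();
}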