use of io.cdap.cdap.api.dataset.lib.Partition in project cdap by caskdata.
the class PartitionedFileSetTest method testPartitionConsumingWithFilterAndLimit.
@Test
public void testPartitionConsumingWithFilterAndLimit() throws Exception {
final PartitionedFileSet dataset = dsFrameworkUtil.getInstance(pfsInstance);
final TransactionAware txAwareDataset = (TransactionAware) dataset;
final Set<PartitionKey> partitionKeys1 = Sets.newHashSet();
for (int i = 0; i < 10; i++) {
partitionKeys1.add(generateUniqueKey());
}
final Set<PartitionKey> partitionKeys2 = Sets.newHashSet();
for (int i = 0; i < 15; i++) {
partitionKeys2.add(generateUniqueKey());
}
final SimplePartitionConsumer partitionConsumer = new SimplePartitionConsumer(dataset);
// (consumption only happens at transaction borders)
for (final PartitionKey partitionKey : partitionKeys1) {
dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {
@Override
public void apply() throws Exception {
dataset.getPartitionOutput(partitionKey).addPartition();
}
});
}
dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {
@Override
public void apply() throws Exception {
// Initial consumption results in the partitions corresponding to partitionKeys1 to be consumed because only
// those partitions are added to the dataset at this point
List<Partition> consumedPartitions = Lists.newArrayList();
// with limit = 1, the returned iterator is only size 1, even though there are more unconsumed partitions
Iterables.addAll(consumedPartitions, partitionConsumer.consumePartitions(1));
Assert.assertEquals(1, consumedPartitions.size());
// ask for 5 more
Iterables.addAll(consumedPartitions, partitionConsumer.consumePartitions(5));
Assert.assertEquals(6, consumedPartitions.size());
// ask for 5 more, but there are only 4 more unconsumed partitions (size of partitionKeys1 is 10).
Iterables.addAll(consumedPartitions, partitionConsumer.consumePartitions(5));
Assert.assertEquals(10, consumedPartitions.size());
Set<PartitionKey> retrievedKeys = Sets.newHashSet();
for (Partition consumedPartition : consumedPartitions) {
retrievedKeys.add(consumedPartition.getPartitionKey());
}
Assert.assertEquals(partitionKeys1, retrievedKeys);
}
});
dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {
@Override
public void apply() throws Exception {
for (PartitionKey partitionKey : partitionKeys2) {
dataset.getPartitionOutput(partitionKey).addPartition();
}
}
});
dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {
@Override
public void apply() throws Exception {
// using the same PartitionConsumer (which remembers the PartitionConsumerState) to consume additional
// partitions results in only the newly added partitions (corresponding to partitionKeys2) to be returned
List<Partition> consumedPartitions = Lists.newArrayList();
Iterables.addAll(consumedPartitions, partitionConsumer.consumePartitions(1));
// even though we set limit to 1 in the previous call to consumePartitions, we get all the elements of
// partitionKeys2, because they were all added in the same transaction
Set<PartitionKey> retrievedKeys = Sets.newHashSet();
for (Partition consumedPartition : consumedPartitions) {
retrievedKeys.add(consumedPartition.getPartitionKey());
}
Assert.assertEquals(partitionKeys2, retrievedKeys);
}
});
dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {
@Override
public void apply() throws Exception {
// consuming the partitions again, without adding any new partitions returns an empty iterator
Assert.assertTrue(partitionConsumer.consumePartitions().isEmpty());
}
});
dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {
@Override
public void apply() throws Exception {
// creating a new PartitionConsumer resets the consumption state.
// test combination of filter and limit
SimplePartitionConsumer newPartitionConsumer = new SimplePartitionConsumer(dataset);
List<Partition> consumedPartitions = Lists.newArrayList();
// the partitionFilter will match partitionKeys [1, 7), of which there are 6
final PartitionFilter partitionFilter = PartitionFilter.builder().addRangeCondition("i", 1, 7).build();
final Predicate<PartitionDetail> predicate = new Predicate<PartitionDetail>() {
@Override
public boolean apply(PartitionDetail partitionDetail) {
return partitionFilter.match(partitionDetail.getPartitionKey());
}
};
// apply the filter (narrows it down to 6 elements) and apply a limit of 4 results in 4 consumed partitions
Iterables.addAll(consumedPartitions, newPartitionConsumer.consumePartitions(4, predicate));
Assert.assertEquals(4, consumedPartitions.size());
// apply a limit of 3, using the same filter returns the remaining 2 elements that fit that filter
Iterables.addAll(consumedPartitions, newPartitionConsumer.consumePartitions(3, predicate));
Assert.assertEquals(6, consumedPartitions.size());
// assert that the partitions returned have partition keys, where the i values range from [1, 7]
Set<Integer> expectedIFields = new HashSet<>();
for (int i = 1; i < 7; i++) {
expectedIFields.add(i);
}
Set<Integer> actualIFields = new HashSet<>();
for (Partition consumedPartition : consumedPartitions) {
actualIFields.add((Integer) consumedPartition.getPartitionKey().getField("i"));
}
Assert.assertEquals(expectedIFields, actualIFields);
}
});
}
use of io.cdap.cdap.api.dataset.lib.Partition in project cdap by caskdata.
the class PartitionedFileSetArgumentsTest method testGetInputPartitionKeys.
@Test
public void testGetInputPartitionKeys() throws Exception {
Map<String, String> arguments = new HashMap<>();
Assert.assertEquals(0, PartitionedFileSetArguments.getInputPartitionKeys(arguments).size());
List<? extends Partition> partitions = Lists.newArrayList(new BasicPartition(null, "path/doesn't/matter/1", generateUniqueKey()), new BasicPartition(null, "path/doesn't/matter/2", generateUniqueKey()), new BasicPartition(null, "path/doesn't/matter/3", generateUniqueKey()));
for (Partition partition : partitions) {
PartitionedFileSetArguments.addInputPartition(arguments, partition);
}
List<PartitionKey> inputPartitionKeys = Lists.transform(partitions, new Function<Partition, PartitionKey>() {
@Nullable
@Override
public PartitionKey apply(Partition input) {
return input.getPartitionKey();
}
});
Assert.assertEquals(inputPartitionKeys, PartitionedFileSetArguments.getInputPartitionKeys(arguments));
arguments.clear();
PartitionedFileSetArguments.addInputPartitions(arguments, partitions.iterator());
Assert.assertEquals(inputPartitionKeys, PartitionedFileSetArguments.getInputPartitionKeys(arguments));
}
use of io.cdap.cdap.api.dataset.lib.Partition in project cdap by cdapio.
the class PartitionedFileSetArgumentsTest method testGetInputPartitionKeys.
@Test
public void testGetInputPartitionKeys() throws Exception {
Map<String, String> arguments = new HashMap<>();
Assert.assertEquals(0, PartitionedFileSetArguments.getInputPartitionKeys(arguments).size());
List<? extends Partition> partitions = Lists.newArrayList(new BasicPartition(null, "path/doesn't/matter/1", generateUniqueKey()), new BasicPartition(null, "path/doesn't/matter/2", generateUniqueKey()), new BasicPartition(null, "path/doesn't/matter/3", generateUniqueKey()));
for (Partition partition : partitions) {
PartitionedFileSetArguments.addInputPartition(arguments, partition);
}
List<PartitionKey> inputPartitionKeys = Lists.transform(partitions, new Function<Partition, PartitionKey>() {
@Nullable
@Override
public PartitionKey apply(Partition input) {
return input.getPartitionKey();
}
});
Assert.assertEquals(inputPartitionKeys, PartitionedFileSetArguments.getInputPartitionKeys(arguments));
arguments.clear();
PartitionedFileSetArguments.addInputPartitions(arguments, partitions.iterator());
Assert.assertEquals(inputPartitionKeys, PartitionedFileSetArguments.getInputPartitionKeys(arguments));
}
use of io.cdap.cdap.api.dataset.lib.Partition in project cdap by cdapio.
the class PartitionedFileSetTest method testSimplePartitionConsuming.
@Test
public void testSimplePartitionConsuming() throws Exception {
final PartitionedFileSet dataset = dsFrameworkUtil.getInstance(pfsInstance);
final TransactionAware txAwareDataset = (TransactionAware) dataset;
final Set<PartitionKey> partitionKeys1 = Sets.newHashSet();
for (int i = 0; i < 10; i++) {
partitionKeys1.add(generateUniqueKey());
}
final Set<PartitionKey> partitionKeys2 = Sets.newHashSet();
for (int i = 0; i < 15; i++) {
partitionKeys2.add(generateUniqueKey());
}
final SimplePartitionConsumer partitionConsumer = new SimplePartitionConsumer(dataset);
dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {
@Override
public void apply() throws Exception {
for (PartitionKey partitionKey : partitionKeys1) {
dataset.getPartitionOutput(partitionKey).addPartition();
}
}
});
dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {
@Override
public void apply() throws Exception {
// Initial consumption results in the partitions corresponding to partitionKeys1 to be consumed because only
// those partitions are added to the dataset at this point
List<Partition> consumedPartitions = Lists.newArrayList();
Iterables.addAll(consumedPartitions, partitionConsumer.consumePartitions());
Set<PartitionKey> retrievedKeys = Sets.newHashSet();
for (Partition consumedPartition : consumedPartitions) {
retrievedKeys.add(consumedPartition.getPartitionKey());
}
Assert.assertEquals(partitionKeys1, retrievedKeys);
}
});
dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {
@Override
public void apply() throws Exception {
for (PartitionKey partitionKey : partitionKeys2) {
dataset.getPartitionOutput(partitionKey).addPartition();
}
}
});
dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {
@Override
public void apply() throws Exception {
// using the same PartitionConsumer (which remembers the PartitionConsumerState) to consume additional
// partitions results in only the newly added partitions (corresponding to partitionKeys2) to be returned
List<Partition> consumedPartitions = Lists.newArrayList();
Iterables.addAll(consumedPartitions, partitionConsumer.consumePartitions());
Set<PartitionKey> retrievedKeys = Sets.newHashSet();
for (Partition consumedPartition : consumedPartitions) {
retrievedKeys.add(consumedPartition.getPartitionKey());
}
Assert.assertEquals(partitionKeys2, retrievedKeys);
}
});
dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {
@Override
public void apply() throws Exception {
// consuming the partitions again, without adding any new partitions returns an empty iterator
Assert.assertTrue(partitionConsumer.consumePartitions().isEmpty());
}
});
dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {
@Override
public void apply() throws Exception {
// creating a new PartitionConsumer resets the consumption state. Consuming from it then returns an iterator
// with all the partition keys
List<Partition> consumedPartitions = Lists.newArrayList();
Iterables.addAll(consumedPartitions, new SimplePartitionConsumer(dataset).consumePartitions());
Set<PartitionKey> retrievedKeys = Sets.newHashSet();
for (Partition consumedPartition : consumedPartitions) {
retrievedKeys.add(consumedPartition.getPartitionKey());
}
Set<PartitionKey> allKeys = Sets.newHashSet();
allKeys.addAll(partitionKeys1);
allKeys.addAll(partitionKeys2);
Assert.assertEquals(allKeys, retrievedKeys);
}
});
}
use of io.cdap.cdap.api.dataset.lib.Partition in project cdap by cdapio.
the class MapReduceWithPartitionedTest method testPartitionedFileSetWithMR.
private void testPartitionedFileSetWithMR(boolean useCombineFileInputFormat) throws Exception {
ApplicationWithPrograms app = deployApp(AppWithPartitionedFileSet.class, new AppWithPartitionedFileSet.AppConfig(useCombineFileInputFormat));
// write a value to the input table
final Table table = datasetCache.getDataset(AppWithPartitionedFileSet.INPUT);
Transactions.createTransactionExecutor(txExecutorFactory, (TransactionAware) table).execute(new TransactionExecutor.Subroutine() {
@Override
public void apply() {
table.put(Bytes.toBytes("x"), AppWithPartitionedFileSet.ONLY_COLUMN, Bytes.toBytes("1"));
}
});
// a partition key for the map/reduce output
final PartitionKey keyX = PartitionKey.builder().addStringField("type", "x").addLongField("time", 150000L).build();
// run the partition writer m/r with this output partition time
Map<String, String> runtimeArguments = Maps.newHashMap();
Map<String, String> outputArgs = Maps.newHashMap();
PartitionedFileSetArguments.setOutputPartitionKey(outputArgs, keyX);
runtimeArguments.putAll(RuntimeArguments.addScope(Scope.DATASET, PARTITIONED, outputArgs));
Assert.assertTrue(runProgram(app, AppWithPartitionedFileSet.PartitionWriter.class, new BasicArguments(runtimeArguments)));
// this should have created a partition in the tpfs
final PartitionedFileSet dataset = datasetCache.getDataset(PARTITIONED);
Transactions.createTransactionExecutor(txExecutorFactory, (TransactionAware) dataset).execute(new TransactionExecutor.Subroutine() {
@Override
public void apply() {
Partition partition = dataset.getPartition(keyX);
Assert.assertNotNull(partition);
String path = partition.getRelativePath();
Assert.assertTrue(path.contains("x"));
Assert.assertTrue(path.contains("150000"));
}
});
// delete the data in the input table and write a new row
Transactions.createTransactionExecutor(txExecutorFactory, (TransactionAware) table).execute(new TransactionExecutor.Subroutine() {
@Override
public void apply() {
table.delete(Bytes.toBytes("x"));
table.put(Bytes.toBytes("y"), AppWithPartitionedFileSet.ONLY_COLUMN, Bytes.toBytes("2"));
}
});
// a new partition key for the next map/reduce
final PartitionKey keyY = PartitionKey.builder().addStringField("type", "y").addLongField("time", 200000L).build();
// now run the m/r again with a new partition time, say 5 minutes later
PartitionedFileSetArguments.setOutputPartitionKey(outputArgs, keyY);
runtimeArguments.putAll(RuntimeArguments.addScope(Scope.DATASET, PARTITIONED, outputArgs));
Assert.assertTrue(runProgram(app, AppWithPartitionedFileSet.PartitionWriter.class, new BasicArguments(runtimeArguments)));
// this should have created a partition in the tpfs
Transactions.createTransactionExecutor(txExecutorFactory, (TransactionAware) dataset).execute(new TransactionExecutor.Subroutine() {
@Override
public void apply() {
Partition partition = dataset.getPartition(keyY);
Assert.assertNotNull(partition);
String path = partition.getRelativePath();
Assert.assertNotNull(path);
Assert.assertTrue(path.contains("y"));
Assert.assertTrue(path.contains("200000"));
}
});
// a partition filter that matches the outputs of both map/reduces
PartitionFilter filterXY = PartitionFilter.builder().addRangeCondition("type", "x", "z").build();
// now run a map/reduce that reads all the partitions
runtimeArguments = Maps.newHashMap();
Map<String, String> inputArgs = Maps.newHashMap();
PartitionedFileSetArguments.setInputPartitionFilter(inputArgs, filterXY);
runtimeArguments.putAll(RuntimeArguments.addScope(Scope.DATASET, PARTITIONED, inputArgs));
runtimeArguments.put(AppWithPartitionedFileSet.ROW_TO_WRITE, "a");
Assert.assertTrue(runProgram(app, AppWithPartitionedFileSet.PartitionReader.class, new BasicArguments(runtimeArguments)));
// this should have read both partitions - and written both x and y to row a
final Table output = datasetCache.getDataset(AppWithPartitionedFileSet.OUTPUT);
Transactions.createTransactionExecutor(txExecutorFactory, (TransactionAware) output).execute(new TransactionExecutor.Subroutine() {
@Override
public void apply() {
Row row = output.get(Bytes.toBytes("a"));
Assert.assertEquals("1", row.getString("x"));
Assert.assertEquals("{type=x, time=150000}", row.getString("x_key"));
Assert.assertEquals("2", row.getString("y"));
Assert.assertEquals("{type=y, time=200000}", row.getString("y_key"));
}
});
// a partition filter that matches the output key of the first map/reduce
PartitionFilter filterX = PartitionFilter.builder().addValueCondition("type", "x").addRangeCondition("time", null, 160000L).build();
// now run a map/reduce that reads a range of the partitions, namely the first one
inputArgs.clear();
PartitionedFileSetArguments.setInputPartitionFilter(inputArgs, filterX);
runtimeArguments.putAll(RuntimeArguments.addScope(Scope.DATASET, PARTITIONED, inputArgs));
runtimeArguments.put(AppWithPartitionedFileSet.ROW_TO_WRITE, "b");
Assert.assertTrue(runProgram(app, AppWithPartitionedFileSet.PartitionReader.class, new BasicArguments(runtimeArguments)));
// this should have read the first partition only - and written only x to row b
Transactions.createTransactionExecutor(txExecutorFactory, (TransactionAware) output).execute(new TransactionExecutor.Subroutine() {
@Override
public void apply() {
Row row = output.get(Bytes.toBytes("b"));
Assert.assertEquals("1", row.getString("x"));
Assert.assertEquals("{type=x, time=150000}", row.getString("x_key"));
Assert.assertNull(row.get("y"));
Assert.assertNull(row.get("y_key"));
}
});
// a partition filter that matches no key
PartitionFilter filterMT = PartitionFilter.builder().addValueCondition("type", "nosuchthing").build();
// now run a map/reduce that reads an empty range of partitions (the filter matches nothing)
inputArgs.clear();
PartitionedFileSetArguments.setInputPartitionFilter(inputArgs, filterMT);
runtimeArguments.putAll(RuntimeArguments.addScope(Scope.DATASET, PARTITIONED, inputArgs));
runtimeArguments.put(AppWithPartitionedFileSet.ROW_TO_WRITE, "n");
Assert.assertTrue(runProgram(app, AppWithPartitionedFileSet.PartitionReader.class, new BasicArguments(runtimeArguments)));
// this should have read no partitions - and written nothing to row n
Transactions.createTransactionExecutor(txExecutorFactory, (TransactionAware) output).execute(new TransactionExecutor.Subroutine() {
@Override
public void apply() {
Row row = output.get(Bytes.toBytes("n"));
Assert.assertTrue(row.isEmpty());
}
});
}
Aggregations