Use of co.cask.cdap.api.dataset.lib.PartitionedFileSet in project cdap by caskdata.
The class PartitionedFileSetTest, method testPartitionMetadata.
@Test
public void testPartitionMetadata() throws Exception {
  final PartitionedFileSet dataset = dsFrameworkUtil.getInstance(pfsInstance);
  dsFrameworkUtil.newTransactionExecutor((TransactionAware) dataset).execute(new TransactionExecutor.Subroutine() {
    @Override
    public void apply() throws Exception {
      PartitionKey partitionKey = PartitionKey.builder()
        .addIntField("i", 42)
        .addLongField("l", 17L)
        .addStringField("s", "x")
        .build();
      ImmutableMap<String, String> metadata = ImmutableMap.of("key1", "value", "key2", "value2", "key3", "value2");
      PartitionOutput partitionOutput = dataset.getPartitionOutput(partitionKey);
      partitionOutput.setMetadata(metadata);
      partitionOutput.addPartition();
      PartitionDetail partitionDetail = dataset.getPartition(partitionKey);
      Assert.assertNotNull(partitionDetail);
      Assert.assertEquals(metadata, partitionDetail.getMetadata().asMap());
    }
  });
}
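In isolation, the metadata round trip above reduces to a few PartitionedFileSet calls. A minimal sketch, assuming `pfs` is a dataset instance participating in an already-started transaction (the scaffolding fields `dsFrameworkUtil` and `pfsInstance` belong to the test class and are not shown in this excerpt):

PartitionKey key = PartitionKey.builder()
  .addIntField("i", 1)
  .addLongField("l", 1L)
  .addStringField("s", "a")
  .build();
PartitionOutput output = pfs.getPartitionOutput(key);
// metadata is attached to the PartitionOutput before the partition is registered
output.setMetadata(ImmutableMap.of("source", "example"));
output.addPartition();
// within the same transaction, this dataset instance already sees the partition and its metadata
Map<String, String> readBack = pfs.getPartition(key).getMetadata().asMap();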
Use of co.cask.cdap.api.dataset.lib.PartitionedFileSet in project cdap by caskdata.
The class PartitionedFileSetTest, method testPartitionConsumer.
@Test
public void testPartitionConsumer() throws Exception {
  // exercises an edge case of partition consumption: while partitions are being consumed, another in-progress
  // transaction has added a partition but not yet committed it, so that partition is not visible to the consumer.
  // note: each concurrent transaction needs its own instance of the dataset, because the dataset holds the txId
  // as an instance variable
  PartitionedFileSet dataset1 = dsFrameworkUtil.getInstance(pfsInstance);
  PartitionedFileSet dataset2 = dsFrameworkUtil.getInstance(pfsInstance);
  PartitionedFileSet dataset3 = dsFrameworkUtil.getInstance(pfsInstance);

  // producer simply adds an initial partition
  TransactionContext txContext1 = new TransactionContext(txClient, (TransactionAware) dataset1);
  txContext1.start();
  PartitionKey partitionKey1 = generateUniqueKey();
  dataset1.getPartitionOutput(partitionKey1).addPartition();
  txContext1.finish();

  // consumer simply consumes the initial partition
  TransactionContext txContext2 = new TransactionContext(txClient, (TransactionAware) dataset2);
  txContext2.start();
  SimplePartitionConsumer partitionConsumer = new SimplePartitionConsumer(dataset2);
  List<PartitionDetail> partitions = partitionConsumer.consumePartitions();
  Assert.assertEquals(1, partitions.size());
  Assert.assertEquals(partitionKey1, partitions.get(0).getPartitionKey());
  txContext2.finish();

  // producer adds a 2nd partition but does not yet commit the transaction
  txContext1.start();
  PartitionKey partitionKey2 = generateUniqueKey();
  dataset1.getPartitionOutput(partitionKey2).addPartition();

  // another producer adds a 3rd partition, but does not yet commit the transaction
  TransactionContext txContext3 = new TransactionContext(txClient, (TransactionAware) dataset3);
  txContext3.start();
  PartitionKey partitionKey3 = generateUniqueKey();
  dataset3.getPartitionOutput(partitionKey3).addPartition();

  // start and commit an empty transaction so the next transaction's read pointer is higher than the previous
  // transaction's write pointer; otherwise, the previous transaction may not be included in the in-progress list
  txContext2.start();
  txContext2.finish();

  // the consumer attempts to consume after the partitions were added but before they were committed; because of
  // this, the partitions are not visible and will not be consumed
  txContext2.start();
  Assert.assertTrue(partitionConsumer.consumePartitions().isEmpty());
  txContext2.finish();

  // both producers commit the transactions in which the 2nd and 3rd partitions were added
  txContext1.finish();
  txContext3.finish();

  // the next time the consumer runs, it processes the two remaining partitions
  txContext2.start();
  partitions = partitionConsumer.consumePartitions();
  Assert.assertEquals(2, partitions.size());
  // ordering may differ
  Assert.assertEquals(ImmutableSet.of(partitionKey2, partitionKey3),
                      ImmutableSet.of(partitions.get(0).getPartitionKey(), partitions.get(1).getPartitionKey()));
  txContext2.finish();
}
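The consumer side of this test follows a fixed pattern: start a transaction, consume, finish, so that each consumePartitions call works against a snapshot containing only previously committed partitions. A condensed sketch of that loop, assuming the same txClient and dataset as above (SimplePartitionConsumer is a helper class of the test; process() is a hypothetical placeholder for real work, and the enclosing method would declare throws Exception):

TransactionContext txContext = new TransactionContext(txClient, (TransactionAware) dataset);
SimplePartitionConsumer consumer = new SimplePartitionConsumer(dataset);
txContext.start();
try {
  // only partitions committed before this transaction started are visible here
  for (PartitionDetail partition : consumer.consumePartitions()) {
    process(partition); // hypothetical processing hook
  }
  txContext.finish(); // commits, persisting the consumer's progress
} catch (Exception e) {
  txContext.abort(); // rolls back, so the same partitions are returned on the next run
  throw e;
}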
Use of co.cask.cdap.api.dataset.lib.PartitionedFileSet in project cdap by caskdata.
The class PartitionedFileSetTest, method testRollbackOfPartitionCreateWhereItAlreadyExisted.
@Test
public void testRollbackOfPartitionCreateWhereItAlreadyExisted() throws Exception {
  PartitionedFileSet pfs = dsFrameworkUtil.getInstance(pfsInstance);
  TransactionContext txContext = new TransactionContext(txClient, (TransactionAware) pfs);
  txContext.start();
  Assert.assertNull(pfs.getPartition(PARTITION_KEY));
  Location file1Location = createPartition(pfs, PARTITION_KEY, "file1");
  Assert.assertNotNull(pfs.getPartition(PARTITION_KEY));
  txContext.finish();
  // the file should exist because the transaction completed successfully
  Assert.assertTrue(file1Location.exists());

  // if you attempt to add a partition X that already exists, the transaction rollback must not remove
  // the files of the original, existing partition X
  txContext.start();
  Assert.assertNotNull(pfs.getPartition(PARTITION_KEY));
  try {
    // PartitionedFileSet#getPartitionOutput should fail for an existing partition
    createPartition(pfs, PARTITION_KEY, "file2");
    Assert.fail("Expected PartitionAlreadyExistsException");
  } catch (PartitionAlreadyExistsException expected) {
  }
  // because of the above failure, abort and roll back the transaction
  txContext.abort();
  // file1 should still exist: the create failed before any output was written, so the rollback
  // never touches the existing partition's files
  Assert.assertTrue(file1Location.exists());
  // file2 should not exist
  txContext.start();
  Assert.assertFalse(pfs.getPartition(PARTITION_KEY).getLocation().append("file2").exists());
  txContext.finish();
}
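createPartition is a private helper of the test class that this excerpt does not include. A plausible reconstruction, assuming it writes a small marker file into the partition's output location before registering the partition (the exact file contents are immaterial to the assertions above):

private Location createPartition(PartitionedFileSet pfs, PartitionKey key, String fileName) throws Exception {
  // getPartitionOutput fails with PartitionAlreadyExistsException if the partition already exists,
  // which is exactly what the rollback test above relies on
  PartitionOutput output = pfs.getPartitionOutput(key);
  Location outputLocation = output.getLocation().append(fileName);
  try (OutputStream out = outputLocation.getOutputStream()) {
    out.write("content".getBytes(StandardCharsets.UTF_8));
  }
  output.addPartition();
  return outputLocation;
}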
Use of co.cask.cdap.api.dataset.lib.PartitionedFileSet in project cdap by caskdata.
The class PartitionedFileSetTest, method testMetadataForNonexistentPartition.
@Test
public void testMetadataForNonexistentPartition() throws Exception {
  PartitionedFileSet pfs = dsFrameworkUtil.getInstance(pfsInstance);
  PartitionKey key = generateUniqueKey();
  TransactionContext txContext = new TransactionContext(txClient, (TransactionAware) pfs);
  txContext.start();
  try {
    // no partitions were added to the dataset, so any partition key should throw a PartitionNotFoundException
    pfs.addMetadata(key, "metaKey", "metaValue");
    Assert.fail("Expected not to find key: " + key);
  } catch (PartitionNotFoundException e) {
    Assert.assertEquals(pfsInstance.getEntityName(), e.getPartitionedFileSetName());
    Assert.assertEquals(key, e.getPartitionKey());
  } finally {
    txContext.abort();
  }
}
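generateUniqueKey() is another helper referenced throughout these tests but not shown in this excerpt. A plausible sketch, assuming the three-field partitioning (int i, long l, string s) seen in testPartitionMetadata and a monotonically increasing counter to guarantee uniqueness; the exact field values are illustrative:

private static final AtomicInteger keyCounter = new AtomicInteger();

private static PartitionKey generateUniqueKey() {
  int i = keyCounter.incrementAndGet();
  return PartitionKey.builder()
    .addIntField("i", i)
    .addLongField("l", (long) i)
    .addStringField("s", String.valueOf(i))
    .build();
}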
Use of co.cask.cdap.api.dataset.lib.PartitionedFileSet in project cdap by caskdata.
The class PartitionedFileSetTest, method testPartitionConsumingWithFilterAndLimit.
@Test
public void testPartitionConsumingWithFilterAndLimit() throws Exception {
  final PartitionedFileSet dataset = dsFrameworkUtil.getInstance(pfsInstance);
  final TransactionAware txAwareDataset = (TransactionAware) dataset;
  final Set<PartitionKey> partitionKeys1 = Sets.newHashSet();
  for (int i = 0; i < 10; i++) {
    partitionKeys1.add(generateUniqueKey());
  }
  final Set<PartitionKey> partitionKeys2 = Sets.newHashSet();
  for (int i = 0; i < 15; i++) {
    partitionKeys2.add(generateUniqueKey());
  }
  final SimplePartitionConsumer partitionConsumer = new SimplePartitionConsumer(dataset);
  // add each partition in its own transaction (consumption only happens at transaction borders)
  for (final PartitionKey partitionKey : partitionKeys1) {
    dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {
      @Override
      public void apply() throws Exception {
        dataset.getPartitionOutput(partitionKey).addPartition();
      }
    });
  }
  dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {
    @Override
    public void apply() throws Exception {
      // the initial consumption returns the partitions corresponding to partitionKeys1, because only
      // those partitions have been added to the dataset at this point
      List<Partition> consumedPartitions = Lists.newArrayList();
      // with limit = 1, the returned iterator has size 1, even though there are more unconsumed partitions
      Iterables.addAll(consumedPartitions, partitionConsumer.consumePartitions(1));
      Assert.assertEquals(1, consumedPartitions.size());
      // ask for 5 more
      Iterables.addAll(consumedPartitions, partitionConsumer.consumePartitions(5));
      Assert.assertEquals(6, consumedPartitions.size());
      // ask for 5 more, but only 4 unconsumed partitions remain (the size of partitionKeys1 is 10)
      Iterables.addAll(consumedPartitions, partitionConsumer.consumePartitions(5));
      Assert.assertEquals(10, consumedPartitions.size());
      Set<PartitionKey> retrievedKeys = Sets.newHashSet();
      for (Partition consumedPartition : consumedPartitions) {
        retrievedKeys.add(consumedPartition.getPartitionKey());
      }
      Assert.assertEquals(partitionKeys1, retrievedKeys);
    }
  });
  dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {
    @Override
    public void apply() throws Exception {
      for (PartitionKey partitionKey : partitionKeys2) {
        dataset.getPartitionOutput(partitionKey).addPartition();
      }
    }
  });
  dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {
    @Override
    public void apply() throws Exception {
      // reusing the same PartitionConsumer (which remembers the PartitionConsumerState) returns only
      // the newly added partitions, corresponding to partitionKeys2
      List<Partition> consumedPartitions = Lists.newArrayList();
      Iterables.addAll(consumedPartitions, partitionConsumer.consumePartitions(1));
      // even though the limit is 1 in this call to consumePartitions, we get all the elements of
      // partitionKeys2, because they were all added in the same transaction
      Set<PartitionKey> retrievedKeys = Sets.newHashSet();
      for (Partition consumedPartition : consumedPartitions) {
        retrievedKeys.add(consumedPartition.getPartitionKey());
      }
      Assert.assertEquals(partitionKeys2, retrievedKeys);
    }
  });
  dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {
    @Override
    public void apply() throws Exception {
      // consuming the partitions again, without adding any new partitions, returns an empty iterator
      Assert.assertTrue(partitionConsumer.consumePartitions().isEmpty());
    }
  });
  dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {
    @Override
    public void apply() throws Exception {
      // creating a new PartitionConsumer resets the consumption state;
      // test the combination of filter and limit
      SimplePartitionConsumer newPartitionConsumer = new SimplePartitionConsumer(dataset);
      List<Partition> consumedPartitions = Lists.newArrayList();
      // the partitionFilter matches the partition keys with i in [1, 7), of which there are 6
      final PartitionFilter partitionFilter = PartitionFilter.builder().addRangeCondition("i", 1, 7).build();
      final Predicate<PartitionDetail> predicate = new Predicate<PartitionDetail>() {
        @Override
        public boolean apply(PartitionDetail partitionDetail) {
          return partitionFilter.match(partitionDetail.getPartitionKey());
        }
      };
      // applying the filter (which narrows the candidates down to 6) with a limit of 4 yields 4 consumed partitions
      Iterables.addAll(consumedPartitions, newPartitionConsumer.consumePartitions(4, predicate));
      Assert.assertEquals(4, consumedPartitions.size());
      // a limit of 3 with the same filter returns the remaining 2 elements that match the filter
      Iterables.addAll(consumedPartitions, newPartitionConsumer.consumePartitions(3, predicate));
      Assert.assertEquals(6, consumedPartitions.size());
      // assert that the returned partitions have keys whose i values cover the range [1, 7)
      Set<Integer> expectedIFields = new HashSet<>();
      for (int i = 1; i < 7; i++) {
        expectedIFields.add(i);
      }
      Set<Integer> actualIFields = new HashSet<>();
      for (Partition consumedPartition : consumedPartitions) {
        actualIFields.add((Integer) consumedPartition.getPartitionKey().getField("i"));
      }
      Assert.assertEquals(expectedIFields, actualIFields);
    }
  });
}
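The PartitionFilter used in the last transaction supports exact-value conditions in addition to ranges, which this excerpt does not exercise. A brief sketch of the builder API, assuming the same three-field partitioning; the specific field values are illustrative:

PartitionFilter filter = PartitionFilter.builder()
  .addRangeCondition("i", 1, 7)   // matches 1 <= i < 7; the upper bound is exclusive
  .addValueCondition("s", "3")    // exact match on the string field
  .build();
boolean matches = filter.match(PartitionKey.builder()
  .addIntField("i", 3)
  .addLongField("l", 3L)
  .addStringField("s", "3")
  .build()); // true: both conditions hold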