Search in sources :

Example 51 with PartitionedFileSet

use of io.cdap.cdap.api.dataset.lib.PartitionedFileSet in project cdap by cdapio.

the class PartitionConsumerTest method testDroppedPartitions.

@Test
public void testDroppedPartitions() throws Exception {
    // Tests the case of a partition in the partition consumer working set being dropped from the Partitioned
    // FileSet (See CDAP-6215)
    final PartitionedFileSet dataset = dsFrameworkUtil.getInstance(pfsInstance);
    final TransactionAware txAwareDataset = (TransactionAware) dataset;
    ConsumerConfiguration configuration = ConsumerConfiguration.builder().setMaxWorkingSetSize(1).setMaxRetries(2).build();
    final PartitionConsumer partitionConsumer = new ConcurrentPartitionConsumer(dataset, new InMemoryStatePersistor(), configuration);
    final PartitionKey partitionKey1 = generateUniqueKey();
    final PartitionKey partitionKey2 = generateUniqueKey();
    // Note: These two partitions are added in separate transactions, so that the first can exist in the working set
    // without the second. Partitions in the same transaction can not be split up (due to their index being the same)
    dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() throws Exception {
            dataset.getPartitionOutput(partitionKey1).addPartition();
        }
    });
    dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() throws Exception {
            dataset.getPartitionOutput(partitionKey2).addPartition();
        }
    });
    dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() throws Exception {
            // consuming and aborting the partition numRetries times plus one (for the first attempt) makes it get removed
            // from the working set
            List<PartitionDetail> partitionDetails = partitionConsumer.consumePartitions(1).getPartitions();
            Assert.assertEquals(1, partitionDetails.size());
            Assert.assertEquals(partitionKey1, partitionDetails.get(0).getPartitionKey());
            // aborting the processing of the partition, to put it back in the working set
            partitionConsumer.onFinish(partitionDetails, false);
        }
    });
    // dropping partitionKey1 from the dataset makes it no longer available for consuming
    dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() throws Exception {
            dataset.dropPartition(partitionKey1);
        }
    });
    dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() throws Exception {
            // first call to consume will drop the partition from the working set, and return nothing, since it was
            // the only partition in the working set
            PartitionConsumerResult result = partitionConsumer.consumePartitions(1);
            Assert.assertEquals(0, result.getPartitions().size());
            Assert.assertEquals(0, result.getFailedPartitions().size());
            // following calls to consumePartitions will repopulate the working set and return additional partition(s)
            result = partitionConsumer.consumePartitions(1);
            Assert.assertEquals(1, result.getPartitions().size());
            Assert.assertEquals(partitionKey2, result.getPartitions().get(0).getPartitionKey());
        }
    });
}
Also used : ConcurrentPartitionConsumer(io.cdap.cdap.api.dataset.lib.partitioned.ConcurrentPartitionConsumer) PartitionConsumerResult(io.cdap.cdap.api.dataset.lib.partitioned.PartitionConsumerResult) PartitionedFileSet(io.cdap.cdap.api.dataset.lib.PartitionedFileSet) TransactionExecutor(org.apache.tephra.TransactionExecutor) TransactionAware(org.apache.tephra.TransactionAware) ConsumerConfiguration(io.cdap.cdap.api.dataset.lib.partitioned.ConsumerConfiguration) PartitionKey(io.cdap.cdap.api.dataset.lib.PartitionKey) ArrayList(java.util.ArrayList) ImmutableList(com.google.common.collect.ImmutableList) List(java.util.List) PartitionConsumer(io.cdap.cdap.api.dataset.lib.partitioned.PartitionConsumer) ConcurrentPartitionConsumer(io.cdap.cdap.api.dataset.lib.partitioned.ConcurrentPartitionConsumer) Test(org.junit.Test)

Example 52 with PartitionedFileSet

use of io.cdap.cdap.api.dataset.lib.PartitionedFileSet in project cdap by cdapio.

the class PartitionConsumerTest method testSimplePartitionConsuming.

@Test
public void testSimplePartitionConsuming() throws Exception {
    final PartitionedFileSet dataset = dsFrameworkUtil.getInstance(pfsInstance);
    final TransactionAware txAwareDataset = (TransactionAware) dataset;
    final Set<PartitionKey> partitionKeys1 = new HashSet<>();
    for (int i = 0; i < 10; i++) {
        partitionKeys1.add(generateUniqueKey());
    }
    final Set<PartitionKey> partitionKeys2 = new HashSet<>();
    for (int i = 0; i < 15; i++) {
        partitionKeys2.add(generateUniqueKey());
    }
    final PartitionConsumer partitionConsumer = new ConcurrentPartitionConsumer(dataset, new InMemoryStatePersistor());
    dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() throws Exception {
            for (PartitionKey partitionKey : partitionKeys1) {
                dataset.getPartitionOutput(partitionKey).addPartition();
            }
        }
    });
    dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() throws Exception {
            // Initial consumption results in the partitions corresponding to partitionKeys1 to be consumed because only
            // those partitions are added to the dataset at this point
            List<? extends Partition> consumedPartitions = partitionConsumer.consumePartitions().getPartitions();
            Assert.assertEquals(partitionKeys1, toKeys(consumedPartitions));
        }
    });
    dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() throws Exception {
            for (PartitionKey partitionKey : partitionKeys2) {
                dataset.getPartitionOutput(partitionKey).addPartition();
            }
        }
    });
    dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() throws Exception {
            // using the same PartitionConsumer (which remembers the PartitionConsumerState) to consume additional
            // partitions results in only the newly added partitions (corresponding to partitionKeys2) to be returned
            Assert.assertEquals(partitionKeys2, toKeys(partitionConsumer.consumePartitions().getPartitions()));
        }
    });
    dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() throws Exception {
            // consuming the partitions again, without adding any new partitions returns an empty iterator
            Assert.assertTrue(partitionConsumer.consumePartitions().getPartitions().isEmpty());
        }
    });
    dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() throws Exception {
            // creating a new PartitionConsumer resets the consumption state. Consuming from it then returns an iterator
            // with all the partition keys
            List<? extends Partition> consumedPartitions = new ConcurrentPartitionConsumer(dataset, new InMemoryStatePersistor()).consumePartitions().getPartitions();
            Set<PartitionKey> allKeys = new HashSet<>();
            allKeys.addAll(partitionKeys1);
            allKeys.addAll(partitionKeys2);
            Assert.assertEquals(allKeys, toKeys(consumedPartitions));
        }
    });
}
Also used : ConcurrentPartitionConsumer(io.cdap.cdap.api.dataset.lib.partitioned.ConcurrentPartitionConsumer) ConsumablePartition(io.cdap.cdap.api.dataset.lib.partitioned.ConsumablePartition) Partition(io.cdap.cdap.api.dataset.lib.Partition) ConsumerWorkingSet(io.cdap.cdap.api.dataset.lib.partitioned.ConsumerWorkingSet) HashSet(java.util.HashSet) PartitionedFileSet(io.cdap.cdap.api.dataset.lib.PartitionedFileSet) Set(java.util.Set) PartitionedFileSet(io.cdap.cdap.api.dataset.lib.PartitionedFileSet) TransactionExecutor(org.apache.tephra.TransactionExecutor) TransactionAware(org.apache.tephra.TransactionAware) PartitionKey(io.cdap.cdap.api.dataset.lib.PartitionKey) ArrayList(java.util.ArrayList) ImmutableList(com.google.common.collect.ImmutableList) List(java.util.List) PartitionConsumer(io.cdap.cdap.api.dataset.lib.partitioned.PartitionConsumer) ConcurrentPartitionConsumer(io.cdap.cdap.api.dataset.lib.partitioned.ConcurrentPartitionConsumer) HashSet(java.util.HashSet) Test(org.junit.Test)

Example 53 with PartitionedFileSet

use of io.cdap.cdap.api.dataset.lib.PartitionedFileSet in project cdap by cdapio.

the class PartitionConsumerTest method testPartitionPutback.

@Test
public void testPartitionPutback() throws Exception {
    final PartitionedFileSet dataset = dsFrameworkUtil.getInstance(pfsInstance);
    final TransactionAware txAwareDataset = (TransactionAware) dataset;
    final Set<PartitionKey> partitionKeys = new HashSet<>();
    for (int i = 0; i < 10; i++) {
        partitionKeys.add(generateUniqueKey());
    }
    final PartitionConsumer partitionConsumer = new ConcurrentPartitionConsumer(dataset, new InMemoryStatePersistor(), ConsumerConfiguration.builder().setMaxRetries(1).build());
    dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() throws Exception {
            for (PartitionKey partitionKey : partitionKeys) {
                dataset.getPartitionOutput(partitionKey).addPartition();
            }
        }
    });
    dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() throws Exception {
            // consume all the partitions
            List<? extends Partition> consumedPartitions = partitionConsumer.consumePartitions().getPartitions();
            Assert.assertEquals(partitionKeys, toKeys(consumedPartitions));
            // consuming the partitions again, without adding any new partitions returns an empty iterator
            Assert.assertTrue(partitionConsumer.consumePartitions().getPartitions().isEmpty());
            // and testing that they are still available for processing, and that there are no failed partitions
            for (int i = 0; i < 5; i++) {
                partitionConsumer.untake(consumedPartitions);
                PartitionConsumerResult result = partitionConsumer.consumePartitions();
                consumedPartitions = result.getPartitions();
                Assert.assertEquals(partitionKeys, toKeys(consumedPartitions));
                Assert.assertEquals(0, result.getFailedPartitions().size());
            }
            // consuming the partitions again, without adding any new partitions returns an empty iterator
            Assert.assertTrue(partitionConsumer.consumePartitions().getPartitions().isEmpty());
            // test functionality to put back a partial subset of the retrieved the partitions
            Partition firstConsumedPartition = consumedPartitions.get(0);
            // test the untakeWithKeys method
            partitionConsumer.untakeWithKeys(ImmutableList.of(firstConsumedPartition.getPartitionKey()));
            consumedPartitions = partitionConsumer.consumePartitions().getPartitions();
            Assert.assertEquals(1, consumedPartitions.size());
            Assert.assertEquals(firstConsumedPartition, consumedPartitions.get(0));
        }
    });
}
Also used : ConcurrentPartitionConsumer(io.cdap.cdap.api.dataset.lib.partitioned.ConcurrentPartitionConsumer) ConsumablePartition(io.cdap.cdap.api.dataset.lib.partitioned.ConsumablePartition) Partition(io.cdap.cdap.api.dataset.lib.Partition) PartitionConsumerResult(io.cdap.cdap.api.dataset.lib.partitioned.PartitionConsumerResult) PartitionedFileSet(io.cdap.cdap.api.dataset.lib.PartitionedFileSet) TransactionExecutor(org.apache.tephra.TransactionExecutor) TransactionAware(org.apache.tephra.TransactionAware) PartitionKey(io.cdap.cdap.api.dataset.lib.PartitionKey) ArrayList(java.util.ArrayList) ImmutableList(com.google.common.collect.ImmutableList) List(java.util.List) PartitionConsumer(io.cdap.cdap.api.dataset.lib.partitioned.PartitionConsumer) ConcurrentPartitionConsumer(io.cdap.cdap.api.dataset.lib.partitioned.ConcurrentPartitionConsumer) HashSet(java.util.HashSet) Test(org.junit.Test)

Example 54 with PartitionedFileSet

use of io.cdap.cdap.api.dataset.lib.PartitionedFileSet in project cdap by cdapio.

the class PartitionConsumerTest method testPartitionConsumer.

@Test
public void testPartitionConsumer() throws Exception {
    // exercises the edge case of partition consumption, when partitions are being consumed, while another in-progress
    // transaction has added a partition, but it has not yet committed, so the partition is not available for the
    // consumer
    PartitionedFileSet dataset1 = dsFrameworkUtil.getInstance(pfsInstance);
    PartitionedFileSet dataset2 = dsFrameworkUtil.getInstance(pfsInstance);
    TransactionManager txManager = dsFrameworkUtil.getTxManager();
    InMemoryTxSystemClient txClient = new InMemoryTxSystemClient(txManager);
    // producer simply adds initial partition
    TransactionContext txContext1 = new TransactionContext(txClient, (TransactionAware) dataset1);
    txContext1.start();
    PartitionKey partitionKey1 = generateUniqueKey();
    dataset1.getPartitionOutput(partitionKey1).addPartition();
    txContext1.finish();
    // consumer simply consumes initial partition
    TransactionContext txContext2 = new TransactionContext(txClient, (TransactionAware) dataset2);
    txContext2.start();
    PartitionConsumer partitionConsumer = new ConcurrentPartitionConsumer(dataset2, new InMemoryStatePersistor());
    List<? extends PartitionDetail> partitionIterator = partitionConsumer.consumePartitions().getPartitions();
    Assert.assertEquals(1, partitionIterator.size());
    Assert.assertEquals(partitionKey1, partitionIterator.get(0).getPartitionKey());
    txContext2.finish();
    // producer adds a second partition, but does not yet commit the transaction
    txContext1.start();
    PartitionKey partitionKey2 = generateUniqueKey();
    dataset1.getPartitionOutput(partitionKey2).addPartition();
    // consumer attempts to consume at a time after the partition was added, but before it committed. Because of this,
    // the partition is not visible and will not be consumed
    txContext2.start();
    Assert.assertTrue(partitionConsumer.consumePartitions().getPartitions().isEmpty());
    txContext2.finish();
    // producer commits the transaction in which the second partition was added
    txContext1.finish();
    // the next time the consumer runs, it processes the second partition
    txContext2.start();
    partitionIterator = partitionConsumer.consumePartitions().getPartitions();
    Assert.assertEquals(1, partitionIterator.size());
    Assert.assertEquals(partitionKey2, partitionIterator.get(0).getPartitionKey());
    txContext2.finish();
}
Also used : ConcurrentPartitionConsumer(io.cdap.cdap.api.dataset.lib.partitioned.ConcurrentPartitionConsumer) TransactionManager(org.apache.tephra.TransactionManager) TransactionContext(org.apache.tephra.TransactionContext) PartitionKey(io.cdap.cdap.api.dataset.lib.PartitionKey) PartitionedFileSet(io.cdap.cdap.api.dataset.lib.PartitionedFileSet) PartitionConsumer(io.cdap.cdap.api.dataset.lib.partitioned.PartitionConsumer) ConcurrentPartitionConsumer(io.cdap.cdap.api.dataset.lib.partitioned.ConcurrentPartitionConsumer) InMemoryTxSystemClient(org.apache.tephra.inmemory.InMemoryTxSystemClient) Test(org.junit.Test)

Example 55 with PartitionedFileSet

use of io.cdap.cdap.api.dataset.lib.PartitionedFileSet in project cdap by cdapio.

the class PartitionConsumerTest method testConsumeAfterDelete.

@Test
public void testConsumeAfterDelete() throws Exception {
    final PartitionedFileSet dataset = dsFrameworkUtil.getInstance(pfsInstance);
    final TransactionAware txAwareDataset = (TransactionAware) dataset;
    final Set<PartitionKey> partitionKeys1 = new HashSet<>();
    for (int i = 0; i < 3; i++) {
        partitionKeys1.add(generateUniqueKey());
    }
    // need to ensure that our consumerConfiguration is larger than the amount we consume initially, so that
    // additional partitions (which will be deleted afterwards) are brought into the working set
    ConsumerConfiguration consumerConfiguration = ConsumerConfiguration.builder().setMaxWorkingSetSize(100).build();
    final PartitionConsumer partitionConsumer = new ConcurrentPartitionConsumer(dataset, new InMemoryStatePersistor(), consumerConfiguration);
    dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() throws Exception {
            for (PartitionKey partitionKey : partitionKeys1) {
                dataset.getPartitionOutput(partitionKey).addPartition();
            }
        }
    });
    dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() throws Exception {
            // and not consumed
            for (int i = 0; i < 2; i++) {
                dataset.getPartitionOutput(generateUniqueKey()).addPartition();
            }
        }
    });
    dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() throws Exception {
            // consume 3 of the 5 initial partitions
            Assert.assertEquals(partitionKeys1, toKeys(partitionConsumer.consumePartitions(3).getPartitions()));
        }
    });
    final Set<PartitionKey> partitionKeys2 = new HashSet<>();
    for (int i = 0; i < 5; i++) {
        partitionKeys2.add(generateUniqueKey());
    }
    dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() throws Exception {
            // drop all existing partitions (2 of which are not consumed)
            for (PartitionDetail partitionDetail : dataset.getPartitions(PartitionFilter.ALWAYS_MATCH)) {
                dataset.dropPartition(partitionDetail.getPartitionKey());
            }
            // add 5 new ones
            for (PartitionKey partitionKey : partitionKeys2) {
                dataset.getPartitionOutput(partitionKey).addPartition();
            }
        }
    });
    dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() throws Exception {
            // the consumed partition keys should correspond to partitionKeys2, and not include the dropped, but unconsumed
            // partitions added before them
            Assert.assertEquals(partitionKeys2, toKeys(partitionConsumer.consumePartitions().getPartitions()));
        }
    });
    dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() throws Exception {
            // consuming the partitions again, without adding any new partitions returns an empty iterator
            Assert.assertTrue(partitionConsumer.consumePartitions().getPartitions().isEmpty());
        }
    });
    dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() throws Exception {
            // creating a new PartitionConsumer resets the consumption state. Consuming from it then returns an iterator
            // with all the partition keys added after the deletions
            ConcurrentPartitionConsumer partitionConsumer2 = new ConcurrentPartitionConsumer(dataset, new InMemoryStatePersistor());
            Assert.assertEquals(partitionKeys2, toKeys(partitionConsumer2.consumePartitions().getPartitions()));
        }
    });
}
Also used : ConcurrentPartitionConsumer(io.cdap.cdap.api.dataset.lib.partitioned.ConcurrentPartitionConsumer) PartitionedFileSet(io.cdap.cdap.api.dataset.lib.PartitionedFileSet) TransactionExecutor(org.apache.tephra.TransactionExecutor) PartitionDetail(io.cdap.cdap.api.dataset.lib.PartitionDetail) TransactionAware(org.apache.tephra.TransactionAware) ConsumerConfiguration(io.cdap.cdap.api.dataset.lib.partitioned.ConsumerConfiguration) PartitionKey(io.cdap.cdap.api.dataset.lib.PartitionKey) PartitionConsumer(io.cdap.cdap.api.dataset.lib.partitioned.PartitionConsumer) ConcurrentPartitionConsumer(io.cdap.cdap.api.dataset.lib.partitioned.ConcurrentPartitionConsumer) HashSet(java.util.HashSet) Test(org.junit.Test)

Aggregations

PartitionedFileSet (io.cdap.cdap.api.dataset.lib.PartitionedFileSet)112 Test (org.junit.Test)75 PartitionKey (io.cdap.cdap.api.dataset.lib.PartitionKey)53 Location (org.apache.twill.filesystem.Location)47 TransactionAware (org.apache.tephra.TransactionAware)44 TransactionExecutor (org.apache.tephra.TransactionExecutor)44 PartitionDetail (io.cdap.cdap.api.dataset.lib.PartitionDetail)28 IOException (java.io.IOException)26 DataSetException (io.cdap.cdap.api.dataset.DataSetException)24 FileSet (io.cdap.cdap.api.dataset.lib.FileSet)24 List (java.util.List)24 PartitionNotFoundException (io.cdap.cdap.api.dataset.PartitionNotFoundException)22 PartitionAlreadyExistsException (io.cdap.cdap.api.dataset.lib.PartitionAlreadyExistsException)22 ConcurrentPartitionConsumer (io.cdap.cdap.api.dataset.lib.partitioned.ConcurrentPartitionConsumer)22 PartitionConsumer (io.cdap.cdap.api.dataset.lib.partitioned.PartitionConsumer)22 HashSet (java.util.HashSet)19 ImmutableList (com.google.common.collect.ImmutableList)18 ArrayList (java.util.ArrayList)18 TimePartitionedFileSet (io.cdap.cdap.api.dataset.lib.TimePartitionedFileSet)16 TransactionContext (org.apache.tephra.TransactionContext)16