Search in sources :

Example 1 with Partition

use of co.cask.cdap.api.dataset.lib.Partition in project cdap by caskdata.

the class PartitionConsumerTest method testPartitionPutback.

@Test
public void testPartitionPutback() throws Exception {
    final PartitionedFileSet dataset = dsFrameworkUtil.getInstance(pfsInstance);
    final TransactionAware txAwareDataset = (TransactionAware) dataset;
    final Set<PartitionKey> partitionKeys = new HashSet<>();
    for (int i = 0; i < 10; i++) {
        partitionKeys.add(generateUniqueKey());
    }
    final PartitionConsumer partitionConsumer = new ConcurrentPartitionConsumer(dataset, new InMemoryStatePersistor(), ConsumerConfiguration.builder().setMaxRetries(1).build());
    dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() throws Exception {
            for (PartitionKey partitionKey : partitionKeys) {
                dataset.getPartitionOutput(partitionKey).addPartition();
            }
        }
    });
    dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() throws Exception {
            // consume all the partitions
            List<? extends Partition> consumedPartitions = partitionConsumer.consumePartitions().getPartitions();
            Assert.assertEquals(partitionKeys, toKeys(consumedPartitions));
            // consuming the partitions again, without adding any new partitions returns an empty iterator
            Assert.assertTrue(partitionConsumer.consumePartitions().getPartitions().isEmpty());
            // and testing that they are still available for processing, and that there are no failed partitions
            for (int i = 0; i < 5; i++) {
                partitionConsumer.untake(consumedPartitions);
                PartitionConsumerResult result = partitionConsumer.consumePartitions();
                consumedPartitions = result.getPartitions();
                Assert.assertEquals(partitionKeys, toKeys(consumedPartitions));
                Assert.assertEquals(0, result.getFailedPartitions().size());
            }
            // consuming the partitions again, without adding any new partitions returns an empty iterator
            Assert.assertTrue(partitionConsumer.consumePartitions().getPartitions().isEmpty());
            // test functionality to put back a partial subset of the retrieved the partitions
            Partition firstConsumedPartition = consumedPartitions.get(0);
            // test the untakeWithKeys method
            partitionConsumer.untakeWithKeys(ImmutableList.of(firstConsumedPartition.getPartitionKey()));
            consumedPartitions = partitionConsumer.consumePartitions().getPartitions();
            Assert.assertEquals(1, consumedPartitions.size());
            Assert.assertEquals(firstConsumedPartition, consumedPartitions.get(0));
        }
    });
}
Also used : ConcurrentPartitionConsumer(co.cask.cdap.api.dataset.lib.partitioned.ConcurrentPartitionConsumer) ConsumablePartition(co.cask.cdap.api.dataset.lib.partitioned.ConsumablePartition) Partition(co.cask.cdap.api.dataset.lib.Partition) PartitionConsumerResult(co.cask.cdap.api.dataset.lib.partitioned.PartitionConsumerResult) PartitionedFileSet(co.cask.cdap.api.dataset.lib.PartitionedFileSet) TransactionExecutor(org.apache.tephra.TransactionExecutor) TransactionAware(org.apache.tephra.TransactionAware) PartitionKey(co.cask.cdap.api.dataset.lib.PartitionKey) ArrayList(java.util.ArrayList) ImmutableList(com.google.common.collect.ImmutableList) List(java.util.List) ConcurrentPartitionConsumer(co.cask.cdap.api.dataset.lib.partitioned.ConcurrentPartitionConsumer) PartitionConsumer(co.cask.cdap.api.dataset.lib.partitioned.PartitionConsumer) HashSet(java.util.HashSet) Test(org.junit.Test)

Example 2 with Partition

use of co.cask.cdap.api.dataset.lib.Partition in project cdap by caskdata.

the class PartitionConsumerTest method testSimplePartitionConsuming.

@Test
public void testSimplePartitionConsuming() throws Exception {
    final PartitionedFileSet dataset = dsFrameworkUtil.getInstance(pfsInstance);
    final TransactionAware txAwareDataset = (TransactionAware) dataset;
    final Set<PartitionKey> partitionKeys1 = new HashSet<>();
    for (int i = 0; i < 10; i++) {
        partitionKeys1.add(generateUniqueKey());
    }
    final Set<PartitionKey> partitionKeys2 = new HashSet<>();
    for (int i = 0; i < 15; i++) {
        partitionKeys2.add(generateUniqueKey());
    }
    final PartitionConsumer partitionConsumer = new ConcurrentPartitionConsumer(dataset, new InMemoryStatePersistor());
    dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() throws Exception {
            for (PartitionKey partitionKey : partitionKeys1) {
                dataset.getPartitionOutput(partitionKey).addPartition();
            }
        }
    });
    dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() throws Exception {
            // Initial consumption results in the partitions corresponding to partitionKeys1 to be consumed because only
            // those partitions are added to the dataset at this point
            List<? extends Partition> consumedPartitions = partitionConsumer.consumePartitions().getPartitions();
            Assert.assertEquals(partitionKeys1, toKeys(consumedPartitions));
        }
    });
    dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() throws Exception {
            for (PartitionKey partitionKey : partitionKeys2) {
                dataset.getPartitionOutput(partitionKey).addPartition();
            }
        }
    });
    dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() throws Exception {
            // using the same PartitionConsumer (which remembers the PartitionConsumerState) to consume additional
            // partitions results in only the newly added partitions (corresponding to partitionKeys2) to be returned
            Assert.assertEquals(partitionKeys2, toKeys(partitionConsumer.consumePartitions().getPartitions()));
        }
    });
    dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() throws Exception {
            // consuming the partitions again, without adding any new partitions returns an empty iterator
            Assert.assertTrue(partitionConsumer.consumePartitions().getPartitions().isEmpty());
        }
    });
    dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() throws Exception {
            // creating a new PartitionConsumer resets the consumption state. Consuming from it then returns an iterator
            // with all the partition keys
            List<? extends Partition> consumedPartitions = new ConcurrentPartitionConsumer(dataset, new InMemoryStatePersistor()).consumePartitions().getPartitions();
            Set<PartitionKey> allKeys = new HashSet<>();
            allKeys.addAll(partitionKeys1);
            allKeys.addAll(partitionKeys2);
            Assert.assertEquals(allKeys, toKeys(consumedPartitions));
        }
    });
}
Also used : ConcurrentPartitionConsumer(co.cask.cdap.api.dataset.lib.partitioned.ConcurrentPartitionConsumer) ConsumablePartition(co.cask.cdap.api.dataset.lib.partitioned.ConsumablePartition) Partition(co.cask.cdap.api.dataset.lib.Partition) HashSet(java.util.HashSet) PartitionedFileSet(co.cask.cdap.api.dataset.lib.PartitionedFileSet) Set(java.util.Set) ConsumerWorkingSet(co.cask.cdap.api.dataset.lib.partitioned.ConsumerWorkingSet) PartitionedFileSet(co.cask.cdap.api.dataset.lib.PartitionedFileSet) TransactionExecutor(org.apache.tephra.TransactionExecutor) TransactionAware(org.apache.tephra.TransactionAware) PartitionKey(co.cask.cdap.api.dataset.lib.PartitionKey) ArrayList(java.util.ArrayList) ImmutableList(com.google.common.collect.ImmutableList) List(java.util.List) ConcurrentPartitionConsumer(co.cask.cdap.api.dataset.lib.partitioned.ConcurrentPartitionConsumer) PartitionConsumer(co.cask.cdap.api.dataset.lib.partitioned.PartitionConsumer) HashSet(java.util.HashSet) Test(org.junit.Test)

Example 3 with Partition

use of co.cask.cdap.api.dataset.lib.Partition in project cdap by caskdata.

the class PartitionedFileSetTest method testPartitionConsumingWithFilterAndLimit.

@Test
public void testPartitionConsumingWithFilterAndLimit() throws Exception {
    final PartitionedFileSet dataset = dsFrameworkUtil.getInstance(pfsInstance);
    final TransactionAware txAwareDataset = (TransactionAware) dataset;
    final Set<PartitionKey> partitionKeys1 = Sets.newHashSet();
    for (int i = 0; i < 10; i++) {
        partitionKeys1.add(generateUniqueKey());
    }
    final Set<PartitionKey> partitionKeys2 = Sets.newHashSet();
    for (int i = 0; i < 15; i++) {
        partitionKeys2.add(generateUniqueKey());
    }
    final SimplePartitionConsumer partitionConsumer = new SimplePartitionConsumer(dataset);
    // (consumption only happens at transaction borders)
    for (final PartitionKey partitionKey : partitionKeys1) {
        dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {

            @Override
            public void apply() throws Exception {
                dataset.getPartitionOutput(partitionKey).addPartition();
            }
        });
    }
    dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() throws Exception {
            // Initial consumption results in the partitions corresponding to partitionKeys1 to be consumed because only
            // those partitions are added to the dataset at this point
            List<Partition> consumedPartitions = Lists.newArrayList();
            // with limit = 1, the returned iterator is only size 1, even though there are more unconsumed partitions
            Iterables.addAll(consumedPartitions, partitionConsumer.consumePartitions(1));
            Assert.assertEquals(1, consumedPartitions.size());
            // ask for 5 more
            Iterables.addAll(consumedPartitions, partitionConsumer.consumePartitions(5));
            Assert.assertEquals(6, consumedPartitions.size());
            // ask for 5 more, but there are only 4 more unconsumed partitions (size of partitionKeys1 is 10).
            Iterables.addAll(consumedPartitions, partitionConsumer.consumePartitions(5));
            Assert.assertEquals(10, consumedPartitions.size());
            Set<PartitionKey> retrievedKeys = Sets.newHashSet();
            for (Partition consumedPartition : consumedPartitions) {
                retrievedKeys.add(consumedPartition.getPartitionKey());
            }
            Assert.assertEquals(partitionKeys1, retrievedKeys);
        }
    });
    dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() throws Exception {
            for (PartitionKey partitionKey : partitionKeys2) {
                dataset.getPartitionOutput(partitionKey).addPartition();
            }
        }
    });
    dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() throws Exception {
            // using the same PartitionConsumer (which remembers the PartitionConsumerState) to consume additional
            // partitions results in only the newly added partitions (corresponding to partitionKeys2) to be returned
            List<Partition> consumedPartitions = Lists.newArrayList();
            Iterables.addAll(consumedPartitions, partitionConsumer.consumePartitions(1));
            // even though we set limit to 1 in the previous call to consumePartitions, we get all the elements of
            // partitionKeys2, because they were all added in the same transaction
            Set<PartitionKey> retrievedKeys = Sets.newHashSet();
            for (Partition consumedPartition : consumedPartitions) {
                retrievedKeys.add(consumedPartition.getPartitionKey());
            }
            Assert.assertEquals(partitionKeys2, retrievedKeys);
        }
    });
    dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() throws Exception {
            // consuming the partitions again, without adding any new partitions returns an empty iterator
            Assert.assertTrue(partitionConsumer.consumePartitions().isEmpty());
        }
    });
    dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() throws Exception {
            // creating a new PartitionConsumer resets the consumption state.
            // test combination of filter and limit
            SimplePartitionConsumer newPartitionConsumer = new SimplePartitionConsumer(dataset);
            List<Partition> consumedPartitions = Lists.newArrayList();
            // the partitionFilter will match partitionKeys [1, 7), of which there are 6
            final PartitionFilter partitionFilter = PartitionFilter.builder().addRangeCondition("i", 1, 7).build();
            final Predicate<PartitionDetail> predicate = new Predicate<PartitionDetail>() {

                @Override
                public boolean apply(PartitionDetail partitionDetail) {
                    return partitionFilter.match(partitionDetail.getPartitionKey());
                }
            };
            // apply the filter (narrows it down to 6 elements) and apply a limit of 4 results in 4 consumed partitions
            Iterables.addAll(consumedPartitions, newPartitionConsumer.consumePartitions(4, predicate));
            Assert.assertEquals(4, consumedPartitions.size());
            // apply a limit of 3, using the same filter returns the remaining 2 elements that fit that filter
            Iterables.addAll(consumedPartitions, newPartitionConsumer.consumePartitions(3, predicate));
            Assert.assertEquals(6, consumedPartitions.size());
            // assert that the partitions returned have partition keys, where the i values range from [1, 7]
            Set<Integer> expectedIFields = new HashSet<>();
            for (int i = 1; i < 7; i++) {
                expectedIFields.add(i);
            }
            Set<Integer> actualIFields = new HashSet<>();
            for (Partition consumedPartition : consumedPartitions) {
                actualIFields.add((Integer) consumedPartition.getPartitionKey().getField("i"));
            }
            Assert.assertEquals(expectedIFields, actualIFields);
        }
    });
}
Also used : Partition(co.cask.cdap.api.dataset.lib.Partition) ImmutableSet(com.google.common.collect.ImmutableSet) Set(java.util.Set) HashSet(java.util.HashSet) PartitionedFileSet(co.cask.cdap.api.dataset.lib.PartitionedFileSet) PartitionedFileSet(co.cask.cdap.api.dataset.lib.PartitionedFileSet) TransactionExecutor(org.apache.tephra.TransactionExecutor) PartitionDetail(co.cask.cdap.api.dataset.lib.PartitionDetail) PartitionNotFoundException(co.cask.cdap.api.dataset.PartitionNotFoundException) IOException(java.io.IOException) DataSetException(co.cask.cdap.api.dataset.DataSetException) Predicate(co.cask.cdap.api.Predicate) PartitionFilter(co.cask.cdap.api.dataset.lib.PartitionFilter) TransactionAware(org.apache.tephra.TransactionAware) PartitionKey(co.cask.cdap.api.dataset.lib.PartitionKey) List(java.util.List) Test(org.junit.Test)

Example 4 with Partition

use of co.cask.cdap.api.dataset.lib.Partition in project cdap by caskdata.

the class MapReduceWithPartitionedTest method testTimePartitionedWithMR.

@Test
public void testTimePartitionedWithMR() throws Exception {
    final ApplicationWithPrograms app = deployApp(AppWithTimePartitionedFileSet.class);
    // write a value to the input table
    final Table table = datasetCache.getDataset(AppWithTimePartitionedFileSet.INPUT);
    Transactions.createTransactionExecutor(txExecutorFactory, (TransactionAware) table).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() {
            table.put(Bytes.toBytes("x"), AppWithTimePartitionedFileSet.ONLY_COLUMN, Bytes.toBytes("1"));
        }
    });
    final long time = DATE_FORMAT.parse("1/15/15 11:15 am").getTime();
    final long time5 = time + TimeUnit.MINUTES.toMillis(5);
    // run the partition writer m/r with this output partition time
    Map<String, String> runtimeArguments = Maps.newHashMap();
    Map<String, String> outputArgs = Maps.newHashMap();
    TimePartitionedFileSetArguments.setOutputPartitionTime(outputArgs, time);
    final ImmutableMap<String, String> assignedMetadata = ImmutableMap.of("region", "13", "data.source.name", "input", "data.source.type", "table");
    TimePartitionedFileSetArguments.setOutputPartitionMetadata(outputArgs, assignedMetadata);
    runtimeArguments.putAll(RuntimeArguments.addScope(Scope.DATASET, TIME_PARTITIONED, outputArgs));
    Assert.assertTrue(runProgram(app, AppWithTimePartitionedFileSet.PartitionWriter.class, new BasicArguments(runtimeArguments)));
    // this should have created a partition in the tpfs
    final TimePartitionedFileSet tpfs = datasetCache.getDataset(TIME_PARTITIONED);
    Transactions.createTransactionExecutor(txExecutorFactory, (TransactionAware) tpfs).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() {
            TimePartitionDetail partition = tpfs.getPartitionByTime(time);
            Assert.assertNotNull(partition);
            String path = partition.getRelativePath();
            Assert.assertNotNull(path);
            Assert.assertTrue(path.contains("2015-01-15/11-15"));
            Assert.assertEquals(assignedMetadata, partition.getMetadata().asMap());
        }
    });
    // delete the data in the input table and write a new row
    Transactions.createTransactionExecutor(txExecutorFactory, (TransactionAware) table).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() {
            table.delete(Bytes.toBytes("x"));
            table.put(Bytes.toBytes("y"), AppWithTimePartitionedFileSet.ONLY_COLUMN, Bytes.toBytes("2"));
        }
    });
    // now run the m/r again with a new partition time, say 5 minutes later
    TimePartitionedFileSetArguments.setOutputPartitionTime(outputArgs, time5);
    runtimeArguments.putAll(RuntimeArguments.addScope(Scope.DATASET, TIME_PARTITIONED, outputArgs));
    // make the mapreduce add the partition in destroy, to validate that this does not fail the job
    runtimeArguments.put(AppWithTimePartitionedFileSet.COMPAT_ADD_PARTITION, "true");
    Assert.assertTrue(runProgram(app, AppWithTimePartitionedFileSet.PartitionWriter.class, new BasicArguments(runtimeArguments)));
    // this should have created a partition in the tpfs
    Transactions.createTransactionExecutor(txExecutorFactory, (TransactionAware) tpfs).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() {
            Partition partition = tpfs.getPartitionByTime(time5);
            Assert.assertNotNull(partition);
            String path = partition.getRelativePath();
            Assert.assertNotNull(path);
            Assert.assertTrue(path.contains("2015-01-15/11-20"));
        }
    });
    // now run a map/reduce that reads all the partitions
    runtimeArguments = Maps.newHashMap();
    Map<String, String> inputArgs = Maps.newHashMap();
    TimePartitionedFileSetArguments.setInputStartTime(inputArgs, time - TimeUnit.MINUTES.toMillis(5));
    TimePartitionedFileSetArguments.setInputEndTime(inputArgs, time5 + TimeUnit.MINUTES.toMillis(5));
    runtimeArguments.putAll(RuntimeArguments.addScope(Scope.DATASET, TIME_PARTITIONED, inputArgs));
    runtimeArguments.put(AppWithTimePartitionedFileSet.ROW_TO_WRITE, "a");
    Assert.assertTrue(runProgram(app, AppWithTimePartitionedFileSet.PartitionReader.class, new BasicArguments(runtimeArguments)));
    // this should have read both partitions - and written both x and y to row a
    final Table output = datasetCache.getDataset(AppWithTimePartitionedFileSet.OUTPUT);
    Transactions.createTransactionExecutor(txExecutorFactory, (TransactionAware) output).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() {
            Row row = output.get(Bytes.toBytes("a"));
            Assert.assertEquals("1", row.getString("x"));
            Assert.assertEquals("2", row.getString("y"));
        }
    });
    // now run a map/reduce that reads a range of the partitions, namely the first one
    TimePartitionedFileSetArguments.setInputStartTime(inputArgs, time - TimeUnit.MINUTES.toMillis(5));
    TimePartitionedFileSetArguments.setInputEndTime(inputArgs, time + TimeUnit.MINUTES.toMillis(2));
    runtimeArguments.putAll(RuntimeArguments.addScope(Scope.DATASET, TIME_PARTITIONED, inputArgs));
    runtimeArguments.put(AppWithTimePartitionedFileSet.ROW_TO_WRITE, "b");
    Assert.assertTrue(runProgram(app, AppWithTimePartitionedFileSet.PartitionReader.class, new BasicArguments(runtimeArguments)));
    // this should have read the first partition only - and written only x to row b
    Transactions.createTransactionExecutor(txExecutorFactory, (TransactionAware) output).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() {
            Row row = output.get(Bytes.toBytes("b"));
            Assert.assertEquals("1", row.getString("x"));
            Assert.assertNull(row.get("y"));
        }
    });
    // now run a map/reduce that reads no partitions (because the range matches nothing)
    TimePartitionedFileSetArguments.setInputStartTime(inputArgs, time - TimeUnit.MINUTES.toMillis(10));
    TimePartitionedFileSetArguments.setInputEndTime(inputArgs, time - TimeUnit.MINUTES.toMillis(9));
    runtimeArguments.putAll(RuntimeArguments.addScope(Scope.DATASET, TIME_PARTITIONED, inputArgs));
    runtimeArguments.put(AppWithTimePartitionedFileSet.ROW_TO_WRITE, "n");
    Assert.assertTrue(runProgram(app, AppWithTimePartitionedFileSet.PartitionReader.class, new BasicArguments(runtimeArguments)));
    // this should have read no partitions - and written nothing to row n
    Transactions.createTransactionExecutor(txExecutorFactory, (TransactionAware) output).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() {
            Row row = output.get(Bytes.toBytes("n"));
            Assert.assertTrue(row.isEmpty());
        }
    });
}
Also used : Partition(co.cask.cdap.api.dataset.lib.Partition) Table(co.cask.cdap.api.dataset.table.Table) TransactionExecutor(org.apache.tephra.TransactionExecutor) ApplicationWithPrograms(co.cask.cdap.internal.app.deploy.pipeline.ApplicationWithPrograms) TransactionAware(org.apache.tephra.TransactionAware) BasicArguments(co.cask.cdap.internal.app.runtime.BasicArguments) TimePartitionDetail(co.cask.cdap.api.dataset.lib.TimePartitionDetail) Row(co.cask.cdap.api.dataset.table.Row) TimePartitionedFileSet(co.cask.cdap.api.dataset.lib.TimePartitionedFileSet) Test(org.junit.Test)

Example 5 with Partition

use of co.cask.cdap.api.dataset.lib.Partition in project cdap by caskdata.

the class MapReduceWithPartitionedTest method testPartitionedFileSetWithMR.

@Test
public void testPartitionedFileSetWithMR() throws Exception {
    final ApplicationWithPrograms app = deployApp(AppWithPartitionedFileSet.class);
    // write a value to the input table
    final Table table = datasetCache.getDataset(AppWithPartitionedFileSet.INPUT);
    Transactions.createTransactionExecutor(txExecutorFactory, (TransactionAware) table).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() {
            table.put(Bytes.toBytes("x"), AppWithPartitionedFileSet.ONLY_COLUMN, Bytes.toBytes("1"));
        }
    });
    // a partition key for the map/reduce output
    final PartitionKey keyX = PartitionKey.builder().addStringField("type", "x").addLongField("time", 150000L).build();
    // run the partition writer m/r with this output partition time
    Map<String, String> runtimeArguments = Maps.newHashMap();
    Map<String, String> outputArgs = Maps.newHashMap();
    PartitionedFileSetArguments.setOutputPartitionKey(outputArgs, keyX);
    runtimeArguments.putAll(RuntimeArguments.addScope(Scope.DATASET, PARTITIONED, outputArgs));
    Assert.assertTrue(runProgram(app, AppWithPartitionedFileSet.PartitionWriter.class, new BasicArguments(runtimeArguments)));
    // this should have created a partition in the tpfs
    final PartitionedFileSet dataset = datasetCache.getDataset(PARTITIONED);
    Transactions.createTransactionExecutor(txExecutorFactory, (TransactionAware) dataset).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() {
            Partition partition = dataset.getPartition(keyX);
            Assert.assertNotNull(partition);
            String path = partition.getRelativePath();
            Assert.assertTrue(path.contains("x"));
            Assert.assertTrue(path.contains("150000"));
        }
    });
    // delete the data in the input table and write a new row
    Transactions.createTransactionExecutor(txExecutorFactory, (TransactionAware) table).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() {
            table.delete(Bytes.toBytes("x"));
            table.put(Bytes.toBytes("y"), AppWithPartitionedFileSet.ONLY_COLUMN, Bytes.toBytes("2"));
        }
    });
    // a new partition key for the next map/reduce
    final PartitionKey keyY = PartitionKey.builder().addStringField("type", "y").addLongField("time", 200000L).build();
    // now run the m/r again with a new partition time, say 5 minutes later
    PartitionedFileSetArguments.setOutputPartitionKey(outputArgs, keyY);
    runtimeArguments.putAll(RuntimeArguments.addScope(Scope.DATASET, PARTITIONED, outputArgs));
    Assert.assertTrue(runProgram(app, AppWithPartitionedFileSet.PartitionWriter.class, new BasicArguments(runtimeArguments)));
    // this should have created a partition in the tpfs
    Transactions.createTransactionExecutor(txExecutorFactory, (TransactionAware) dataset).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() {
            Partition partition = dataset.getPartition(keyY);
            Assert.assertNotNull(partition);
            String path = partition.getRelativePath();
            Assert.assertNotNull(path);
            Assert.assertTrue(path.contains("y"));
            Assert.assertTrue(path.contains("200000"));
        }
    });
    // a partition filter that matches the outputs of both map/reduces
    PartitionFilter filterXY = PartitionFilter.builder().addRangeCondition("type", "x", "z").build();
    // now run a map/reduce that reads all the partitions
    runtimeArguments = Maps.newHashMap();
    Map<String, String> inputArgs = Maps.newHashMap();
    PartitionedFileSetArguments.setInputPartitionFilter(inputArgs, filterXY);
    runtimeArguments.putAll(RuntimeArguments.addScope(Scope.DATASET, PARTITIONED, inputArgs));
    runtimeArguments.put(AppWithPartitionedFileSet.ROW_TO_WRITE, "a");
    Assert.assertTrue(runProgram(app, AppWithPartitionedFileSet.PartitionReader.class, new BasicArguments(runtimeArguments)));
    // this should have read both partitions - and written both x and y to row a
    final Table output = datasetCache.getDataset(AppWithPartitionedFileSet.OUTPUT);
    Transactions.createTransactionExecutor(txExecutorFactory, (TransactionAware) output).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() {
            Row row = output.get(Bytes.toBytes("a"));
            Assert.assertEquals("1", row.getString("x"));
            Assert.assertEquals("2", row.getString("y"));
        }
    });
    // a partition filter that matches the output key of the first map/reduce
    PartitionFilter filterX = PartitionFilter.builder().addValueCondition("type", "x").addRangeCondition("time", null, 160000L).build();
    // now run a map/reduce that reads a range of the partitions, namely the first one
    inputArgs.clear();
    PartitionedFileSetArguments.setInputPartitionFilter(inputArgs, filterX);
    runtimeArguments.putAll(RuntimeArguments.addScope(Scope.DATASET, PARTITIONED, inputArgs));
    runtimeArguments.put(AppWithPartitionedFileSet.ROW_TO_WRITE, "b");
    Assert.assertTrue(runProgram(app, AppWithPartitionedFileSet.PartitionReader.class, new BasicArguments(runtimeArguments)));
    // this should have read the first partition only - and written only x to row b
    Transactions.createTransactionExecutor(txExecutorFactory, (TransactionAware) output).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() {
            Row row = output.get(Bytes.toBytes("b"));
            Assert.assertEquals("1", row.getString("x"));
            Assert.assertNull(row.get("y"));
        }
    });
    // a partition filter that matches no key
    PartitionFilter filterMT = PartitionFilter.builder().addValueCondition("type", "nosuchthing").build();
    // now run a map/reduce that reads an empty range of partitions (the filter matches nothing)
    inputArgs.clear();
    PartitionedFileSetArguments.setInputPartitionFilter(inputArgs, filterMT);
    runtimeArguments.putAll(RuntimeArguments.addScope(Scope.DATASET, PARTITIONED, inputArgs));
    runtimeArguments.put(AppWithPartitionedFileSet.ROW_TO_WRITE, "n");
    Assert.assertTrue(runProgram(app, AppWithPartitionedFileSet.PartitionReader.class, new BasicArguments(runtimeArguments)));
    // this should have read no partitions - and written nothing to row n
    Transactions.createTransactionExecutor(txExecutorFactory, (TransactionAware) output).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() {
            Row row = output.get(Bytes.toBytes("n"));
            Assert.assertTrue(row.isEmpty());
        }
    });
}
Also used : Partition(co.cask.cdap.api.dataset.lib.Partition) Table(co.cask.cdap.api.dataset.table.Table) TransactionExecutor(org.apache.tephra.TransactionExecutor) TimePartitionedFileSet(co.cask.cdap.api.dataset.lib.TimePartitionedFileSet) PartitionedFileSet(co.cask.cdap.api.dataset.lib.PartitionedFileSet) PartitionFilter(co.cask.cdap.api.dataset.lib.PartitionFilter) ApplicationWithPrograms(co.cask.cdap.internal.app.deploy.pipeline.ApplicationWithPrograms) TransactionAware(org.apache.tephra.TransactionAware) PartitionKey(co.cask.cdap.api.dataset.lib.PartitionKey) BasicArguments(co.cask.cdap.internal.app.runtime.BasicArguments) Row(co.cask.cdap.api.dataset.table.Row) Test(org.junit.Test)

Aggregations

Partition (co.cask.cdap.api.dataset.lib.Partition)8 Test (org.junit.Test)8 PartitionKey (co.cask.cdap.api.dataset.lib.PartitionKey)7 TransactionAware (org.apache.tephra.TransactionAware)7 TransactionExecutor (org.apache.tephra.TransactionExecutor)7 PartitionedFileSet (co.cask.cdap.api.dataset.lib.PartitionedFileSet)6 HashSet (java.util.HashSet)5 List (java.util.List)5 Set (java.util.Set)4 PartitionFilter (co.cask.cdap.api.dataset.lib.PartitionFilter)3 ConcurrentPartitionConsumer (co.cask.cdap.api.dataset.lib.partitioned.ConcurrentPartitionConsumer)3 ConsumablePartition (co.cask.cdap.api.dataset.lib.partitioned.ConsumablePartition)3 PartitionConsumer (co.cask.cdap.api.dataset.lib.partitioned.PartitionConsumer)3 ImmutableList (com.google.common.collect.ImmutableList)3 ArrayList (java.util.ArrayList)3 Predicate (co.cask.cdap.api.Predicate)2 DataSetException (co.cask.cdap.api.dataset.DataSetException)2 PartitionNotFoundException (co.cask.cdap.api.dataset.PartitionNotFoundException)2 PartitionDetail (co.cask.cdap.api.dataset.lib.PartitionDetail)2 TimePartitionedFileSet (co.cask.cdap.api.dataset.lib.TimePartitionedFileSet)2