Use of co.cask.cdap.api.dataset.lib.PartitionedFileSet in project cdap by caskdata.
From the class PartitionedFileSetDefinition, method getDataset:
@Override
public PartitionedFileSet getDataset(DatasetContext datasetContext, DatasetSpecification spec,
                                     Map<String, String> arguments,
                                     ClassLoader classLoader) throws IOException {
  // properties must contain the partitioning
  Partitioning partitioning = PartitionedFileSetProperties.getPartitioning(spec.getProperties());
  // make any necessary updates to the arguments
  arguments = updateArgumentsIfNeeded(arguments, partitioning);
  FileSet fileset = filesetDef.getDataset(datasetContext, spec.getSpecification(FILESET_NAME),
                                          arguments, classLoader);
  IndexedTable table = indexedTableDef.getDataset(datasetContext, spec.getSpecification(PARTITION_TABLE_NAME),
                                                  arguments, classLoader);
  return new PartitionedFileSetDataset(datasetContext, spec.getName(), partitioning, fileset, table,
                                       spec, arguments, getExploreProvider());
}
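The Partitioning that getDataset recovers via PartitionedFileSetProperties.getPartitioning(spec.getProperties()) is whatever was embedded in the dataset properties at creation time. A minimal sketch of building such properties; the field names "league" and "season" are illustrative and not part of the snippet above:

import co.cask.cdap.api.dataset.DatasetProperties;
import co.cask.cdap.api.dataset.lib.PartitionedFileSetProperties;
import co.cask.cdap.api.dataset.lib.Partitioning;

// sketch: construct dataset properties that carry a partitioning
static DatasetProperties buildProperties() {
  Partitioning partitioning = Partitioning.builder()
    .addStringField("league")   // illustrative field
    .addIntField("season")      // illustrative field
    .build();
  // getPartitioning(spec.getProperties()) in the definition above recovers
  // exactly the partitioning embedded here
  return PartitionedFileSetProperties.builder()
    .setPartitioning(partitioning)
    .build();
}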
Use of co.cask.cdap.api.dataset.lib.PartitionedFileSet in project cdap by caskdata.
From the class TimePartitionedFileSetDefinition, method getDataset:
@Override
public PartitionedFileSet getDataset(DatasetContext datasetContext, DatasetSpecification spec,
                                     Map<String, String> arguments,
                                     ClassLoader classLoader) throws IOException {
  // make any necessary updates to the arguments
  arguments = updateArgumentsIfNeeded(arguments);
  FileSet fileset = filesetDef.getDataset(datasetContext, spec.getSpecification(FILESET_NAME),
                                          arguments, classLoader);
  IndexedTable table = indexedTableDef.getDataset(datasetContext, spec.getSpecification(PARTITION_TABLE_NAME),
                                                  arguments, classLoader);
  return new TimePartitionedFileSetDataset(datasetContext, spec.getName(), fileset, table,
                                           spec, arguments, getExploreProvider());
}
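A TimePartitionedFileSet keys each partition by a single timestamp rather than by arbitrary fields, which is why updateArgumentsIfNeeded needs no Partitioning here. A short sketch of adding a partition by time, assuming it runs inside a transaction; the actual file writes under the partition's location are elided:

import co.cask.cdap.api.dataset.lib.TimePartitionOutput;
import co.cask.cdap.api.dataset.lib.TimePartitionedFileSet;

// register a partition keyed by an epoch timestamp in milliseconds
void addPartitionAt(TimePartitionedFileSet tpfs, long timeMillis) {
  TimePartitionOutput output = tpfs.getPartitionOutput(timeMillis);
  // ... write the partition's files under output.getLocation() ...
  output.addPartition();
}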
Use of co.cask.cdap.api.dataset.lib.PartitionedFileSet in project cdap by caskdata.
From the class PartitionConsumerTest, method testSimplePartitionConsuming:
@Test
public void testSimplePartitionConsuming() throws Exception {
  final PartitionedFileSet dataset = dsFrameworkUtil.getInstance(pfsInstance);
  final TransactionAware txAwareDataset = (TransactionAware) dataset;

  final Set<PartitionKey> partitionKeys1 = new HashSet<>();
  for (int i = 0; i < 10; i++) {
    partitionKeys1.add(generateUniqueKey());
  }
  final Set<PartitionKey> partitionKeys2 = new HashSet<>();
  for (int i = 0; i < 15; i++) {
    partitionKeys2.add(generateUniqueKey());
  }

  final PartitionConsumer partitionConsumer = new ConcurrentPartitionConsumer(dataset, new InMemoryStatePersistor());

  dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {
    @Override
    public void apply() throws Exception {
      for (PartitionKey partitionKey : partitionKeys1) {
        dataset.getPartitionOutput(partitionKey).addPartition();
      }
    }
  });

  dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {
    @Override
    public void apply() throws Exception {
      // the initial consumption returns the partitions corresponding to partitionKeys1, because only
      // those partitions have been added to the dataset at this point
      List<? extends Partition> consumedPartitions = partitionConsumer.consumePartitions().getPartitions();
      Assert.assertEquals(partitionKeys1, toKeys(consumedPartitions));
    }
  });

  dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {
    @Override
    public void apply() throws Exception {
      for (PartitionKey partitionKey : partitionKeys2) {
        dataset.getPartitionOutput(partitionKey).addPartition();
      }
    }
  });

  dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {
    @Override
    public void apply() throws Exception {
      // consuming again with the same PartitionConsumer (which remembers the PartitionConsumerState)
      // returns only the newly added partitions (corresponding to partitionKeys2)
      Assert.assertEquals(partitionKeys2, toKeys(partitionConsumer.consumePartitions().getPartitions()));
    }
  });

  dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {
    @Override
    public void apply() throws Exception {
      // consuming the partitions again, without adding any new partitions, returns an empty list
      Assert.assertTrue(partitionConsumer.consumePartitions().getPartitions().isEmpty());
    }
  });

  dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {
    @Override
    public void apply() throws Exception {
      // creating a new PartitionConsumer resets the consumption state; consuming from it then
      // returns all the partitions added so far
      List<? extends Partition> consumedPartitions =
        new ConcurrentPartitionConsumer(dataset, new InMemoryStatePersistor()).consumePartitions().getPartitions();
      Set<PartitionKey> allKeys = new HashSet<>();
      allKeys.addAll(partitionKeys1);
      allKeys.addAll(partitionKeys2);
      Assert.assertEquals(allKeys, toKeys(consumedPartitions));
    }
  });
}
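The test relies on two helpers that are not shown here: generateUniqueKey() and toKeys(...). A hypothetical reconstruction, under the assumption that the dataset's partitioning has an int field "i" and a string field "s"; the actual field names depend on how pfsInstance was created:

import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;
import co.cask.cdap.api.dataset.lib.Partition;
import co.cask.cdap.api.dataset.lib.PartitionKey;

private final AtomicInteger counter = new AtomicInteger();

// hypothetical: build a PartitionKey that is unique within the test run
private PartitionKey generateUniqueKey() {
  int i = counter.incrementAndGet();
  return PartitionKey.builder()
    .addIntField("i", i)
    .addStringField("s", "value-" + i)
    .build();
}

// hypothetical: project the consumed Partitions onto their keys for set comparison
private Set<PartitionKey> toKeys(List<? extends Partition> partitions) {
  Set<PartitionKey> keys = new HashSet<>();
  for (Partition partition : partitions) {
    keys.add(partition.getPartitionKey());
  }
  return keys;
}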
Use of co.cask.cdap.api.dataset.lib.PartitionedFileSet in project cdap by caskdata.
From the class PartitionConsumerTest, method testNumRetries:
@Test
public void testNumRetries() throws Exception {
  final PartitionedFileSet dataset = dsFrameworkUtil.getInstance(pfsInstance);
  final TransactionAware txAwareDataset = (TransactionAware) dataset;

  final int numRetries = 1;
  ConsumerConfiguration configuration = ConsumerConfiguration.builder().setMaxRetries(numRetries).build();
  final PartitionConsumer partitionConsumer =
    new ConcurrentPartitionConsumer(dataset, new InMemoryStatePersistor(), configuration);

  final PartitionKey partitionKey = generateUniqueKey();
  dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {
    @Override
    public void apply() throws Exception {
      dataset.getPartitionOutput(partitionKey).addPartition();
    }
  });

  dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {
    @Override
    public void apply() throws Exception {
      // consume the partition from the working set and abort it numRetries + 1 times
      for (int i = 0; i < numRetries + 1; i++) {
        List<PartitionDetail> partitionDetails = partitionConsumer.consumePartitions(1).getPartitions();
        Assert.assertEquals(1, partitionDetails.size());
        Assert.assertEquals(partitionKey, partitionDetails.get(0).getPartitionKey());
        // abort the processing of the partition
        partitionConsumer.onFinish(partitionDetails, false);
      }
      // after the second abort, the partition is discarded entirely, so no partitions are
      // available for consuming
      PartitionConsumerResult result = partitionConsumer.consumePartitions(1);
      Assert.assertEquals(0, result.getPartitions().size());
      Assert.assertEquals(1, result.getFailedPartitions().size());
      Assert.assertEquals(partitionKey, result.getFailedPartitions().get(0).getPartitionKey());
    }
  });
}
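The retry accounting in this test is driven entirely by onFinish: reporting success == false returns the partition to the working set until maxRetries is exhausted, after which it appears in getFailedPartitions() instead. A minimal consumer loop using only the calls shown above; process(...) is a placeholder for application logic:

import java.util.List;
import co.cask.cdap.api.dataset.lib.PartitionDetail;
import co.cask.cdap.api.dataset.lib.partitioned.PartitionConsumer;

// claim up to `limit` partitions, process them, and report the outcome
void consumeOnce(PartitionConsumer consumer, int limit) {
  List<PartitionDetail> partitions = consumer.consumePartitions(limit).getPartitions();
  boolean success = true;
  try {
    for (PartitionDetail partition : partitions) {
      process(partition); // placeholder for the application's processing logic
    }
  } catch (Exception e) {
    success = false;
  }
  // success == false counts as one failed attempt against maxRetries
  consumer.onFinish(partitions, success);
}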
Use of co.cask.cdap.api.dataset.lib.PartitionedFileSet in project cdap by caskdata.
From the class PartitionConsumerTest, method testPartitionPutback:
@Test
public void testPartitionPutback() throws Exception {
  final PartitionedFileSet dataset = dsFrameworkUtil.getInstance(pfsInstance);
  final TransactionAware txAwareDataset = (TransactionAware) dataset;

  final Set<PartitionKey> partitionKeys = new HashSet<>();
  for (int i = 0; i < 10; i++) {
    partitionKeys.add(generateUniqueKey());
  }

  final PartitionConsumer partitionConsumer =
    new ConcurrentPartitionConsumer(dataset, new InMemoryStatePersistor(),
                                    ConsumerConfiguration.builder().setMaxRetries(1).build());

  dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {
    @Override
    public void apply() throws Exception {
      for (PartitionKey partitionKey : partitionKeys) {
        dataset.getPartitionOutput(partitionKey).addPartition();
      }
    }
  });

  dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {
    @Override
    public void apply() throws Exception {
      // consume all the partitions
      List<? extends Partition> consumedPartitions = partitionConsumer.consumePartitions().getPartitions();
      Assert.assertEquals(partitionKeys, toKeys(consumedPartitions));
      // consuming the partitions again, without adding any new partitions, returns an empty list
      Assert.assertTrue(partitionConsumer.consumePartitions().getPartitions().isEmpty());

      // put the partitions back repeatedly; each time they remain available for processing,
      // and there are no failed partitions
      for (int i = 0; i < 5; i++) {
        partitionConsumer.untake(consumedPartitions);
        PartitionConsumerResult result = partitionConsumer.consumePartitions();
        consumedPartitions = result.getPartitions();
        Assert.assertEquals(partitionKeys, toKeys(consumedPartitions));
        Assert.assertEquals(0, result.getFailedPartitions().size());
      }
      // consuming the partitions again, without adding any new partitions, returns an empty list
      Assert.assertTrue(partitionConsumer.consumePartitions().getPartitions().isEmpty());

      // test putting back only a subset of the claimed partitions, via the untakeWithKeys method
      Partition firstConsumedPartition = consumedPartitions.get(0);
      partitionConsumer.untakeWithKeys(ImmutableList.of(firstConsumedPartition.getPartitionKey()));
      consumedPartitions = partitionConsumer.consumePartitions().getPartitions();
      Assert.assertEquals(1, consumedPartitions.size());
      Assert.assertEquals(firstConsumedPartition, consumedPartitions.get(0));
    }
  });
}
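The contrast worth noting here: untake and untakeWithKeys put claimed partitions back without counting against maxRetries (the test above puts them back five times against a limit of one retry and no partition ever fails), whereas onFinish with success == false does count. A sketch of deferring selected partitions; readyToProcess(...) is a hypothetical application predicate:

import java.util.ArrayList;
import java.util.List;
import co.cask.cdap.api.dataset.lib.PartitionDetail;
import co.cask.cdap.api.dataset.lib.partitioned.PartitionConsumer;

// claim partitions, but put back any we are not ready to handle yet;
// deferred partitions remain available to a later consumePartitions() call
void consumeSelectively(PartitionConsumer consumer) {
  List<PartitionDetail> claimed = consumer.consumePartitions().getPartitions();
  List<PartitionDetail> deferred = new ArrayList<>();
  for (PartitionDetail partition : claimed) {
    if (!readyToProcess(partition)) { // hypothetical predicate
      deferred.add(partition);
    }
  }
  consumer.untake(deferred);
}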