use of co.cask.cdap.api.dataset.lib.PartitionDetail in project cdap by caskdata.
the class DataCleansingMapReduceTest method getDataFromFilter.
private Set<String> getDataFromFilter(PartitionedFileSet partitionedFileSet, PartitionFilter filter) throws IOException {
Set<PartitionDetail> partitions = partitionedFileSet.getPartitions(filter);
Set<String> cleanData = new HashSet<>();
for (PartitionDetail partition : partitions) {
Assert.assertEquals(ImmutableMap.of("source.program", "DataCleansingMapReduce"), partition.getMetadata().asMap());
Location partitionLocation = partition.getLocation();
for (Location location : partitionLocation.list()) {
if (location.getName().startsWith("part-")) {
try (BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(location.getInputStream()))) {
String line;
while ((line = bufferedReader.readLine()) != null) {
cleanData.add(line);
}
}
}
}
}
return cleanData;
}
use of co.cask.cdap.api.dataset.lib.PartitionDetail in project cdap by caskdata.
the class PartitionConsumerTest method testPartitionConsumingWithFilterAndLimit.
@Test
public void testPartitionConsumingWithFilterAndLimit() throws Exception {
final PartitionedFileSet dataset = dsFrameworkUtil.getInstance(pfsInstance);
final TransactionAware txAwareDataset = (TransactionAware) dataset;
final Set<PartitionKey> partitionKeys1 = new HashSet<>();
for (int i = 0; i < 10; i++) {
partitionKeys1.add(generateUniqueKey());
}
final Set<PartitionKey> partitionKeys2 = new HashSet<>();
for (int i = 0; i < 15; i++) {
partitionKeys2.add(generateUniqueKey());
}
final PartitionConsumer partitionConsumer = new ConcurrentPartitionConsumer(dataset, new InMemoryStatePersistor());
// (consumption only happens at transaction borders)
for (final PartitionKey partitionKey : partitionKeys1) {
dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {
@Override
public void apply() throws Exception {
dataset.getPartitionOutput(partitionKey).addPartition();
}
});
}
dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {
@Override
public void apply() throws Exception {
// Initial consumption results in the partitions corresponding to partitionKeys1 to be consumed because only
// those partitions are added to the dataset at this point
List<Partition> consumedPartitions = new ArrayList<>();
// with limit = 1, the returned iterator is only size 1, even though there are more unconsumed partitions
Iterables.addAll(consumedPartitions, partitionConsumer.consumePartitions(1).getPartitions());
Assert.assertEquals(1, consumedPartitions.size());
// ask for 5 more
Iterables.addAll(consumedPartitions, partitionConsumer.consumePartitions(5).getPartitions());
Assert.assertEquals(6, consumedPartitions.size());
// ask for 5 more, but there are only 4 more unconsumed partitions (size of partitionKeys1 is 10).
Iterables.addAll(consumedPartitions, partitionConsumer.consumePartitions(5).getPartitions());
Assert.assertEquals(10, consumedPartitions.size());
Assert.assertEquals(partitionKeys1, toKeys(consumedPartitions));
}
});
dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {
@Override
public void apply() throws Exception {
for (PartitionKey partitionKey : partitionKeys2) {
dataset.getPartitionOutput(partitionKey).addPartition();
}
}
});
dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {
@Override
public void apply() throws Exception {
// using the same PartitionConsumer (which remembers the PartitionConsumerState) to consume additional
// partitions results in only the newly added partitions (corresponding to partitionKeys2) to be returned
Assert.assertEquals(partitionKeys2, toKeys(partitionConsumer.consumePartitions().getPartitions()));
}
});
dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {
@Override
public void apply() throws Exception {
// consuming the partitions again, without adding any new partitions returns an empty iterator
Assert.assertTrue(partitionConsumer.consumePartitions().getPartitions().isEmpty());
}
});
dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {
@Override
public void apply() throws Exception {
// creating a new PartitionConsumer resets the consumption state.
// test combination of filter and limit
// the partitionFilter will match partitionKeys [1, 7), of which there are 6
final PartitionFilter partitionFilter = PartitionFilter.builder().addRangeCondition("i", 1, 7).build();
final Predicate<PartitionDetail> predicate = new Predicate<PartitionDetail>() {
@Override
public boolean apply(PartitionDetail partitionDetail) {
return partitionFilter.match(partitionDetail.getPartitionKey());
}
};
ConsumerConfiguration configuration = ConsumerConfiguration.builder().setPartitionPredicate(predicate).build();
PartitionConsumer newPartitionConsumer = new ConcurrentPartitionConsumer(dataset, new InMemoryStatePersistor(), configuration);
List<Partition> consumedPartitions = new ArrayList<>();
// apply the filter (narrows it down to 6 elements) and apply a limit of 4 results in 4 consumed partitions
Iterables.addAll(consumedPartitions, newPartitionConsumer.consumePartitions(4).getPartitions());
Assert.assertEquals(4, consumedPartitions.size());
// apply a limit of 3, using the same filter returns the remaining 2 elements that fit that filter
Iterables.addAll(consumedPartitions, newPartitionConsumer.consumePartitions(3).getPartitions());
Assert.assertEquals(6, consumedPartitions.size());
// assert that the partitions returned have partition keys, where the i values range from [1, 7]
Set<Integer> expectedIFields = new HashSet<>();
for (int i = 1; i < 7; i++) {
expectedIFields.add(i);
}
Set<Integer> actualIFields = new HashSet<>();
for (Partition consumedPartition : consumedPartitions) {
actualIFields.add((Integer) consumedPartition.getPartitionKey().getField("i"));
}
Assert.assertEquals(expectedIFields, actualIFields);
}
});
}
use of co.cask.cdap.api.dataset.lib.PartitionDetail in project cdap by caskdata.
the class PartitionConsumerTest method testConsumeAfterDelete.
@Test
public void testConsumeAfterDelete() throws Exception {
final PartitionedFileSet dataset = dsFrameworkUtil.getInstance(pfsInstance);
final TransactionAware txAwareDataset = (TransactionAware) dataset;
final Set<PartitionKey> partitionKeys1 = new HashSet<>();
for (int i = 0; i < 3; i++) {
partitionKeys1.add(generateUniqueKey());
}
// need to ensure that our consumerConfiguration is larger than the amount we consume initially, so that
// additional partitions (which will be deleted afterwards) are brought into the working set
ConsumerConfiguration consumerConfiguration = ConsumerConfiguration.builder().setMaxWorkingSetSize(100).build();
final PartitionConsumer partitionConsumer = new ConcurrentPartitionConsumer(dataset, new InMemoryStatePersistor(), consumerConfiguration);
dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {
@Override
public void apply() throws Exception {
for (PartitionKey partitionKey : partitionKeys1) {
dataset.getPartitionOutput(partitionKey).addPartition();
}
}
});
dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {
@Override
public void apply() throws Exception {
// and not consumed
for (int i = 0; i < 2; i++) {
dataset.getPartitionOutput(generateUniqueKey()).addPartition();
}
}
});
dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {
@Override
public void apply() throws Exception {
// consume 3 of the 5 initial partitions
Assert.assertEquals(partitionKeys1, toKeys(partitionConsumer.consumePartitions(3).getPartitions()));
}
});
final Set<PartitionKey> partitionKeys2 = new HashSet<>();
for (int i = 0; i < 5; i++) {
partitionKeys2.add(generateUniqueKey());
}
dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {
@Override
public void apply() throws Exception {
// drop all existing partitions (2 of which are not consumed)
for (PartitionDetail partitionDetail : dataset.getPartitions(PartitionFilter.ALWAYS_MATCH)) {
dataset.dropPartition(partitionDetail.getPartitionKey());
}
// add 5 new ones
for (PartitionKey partitionKey : partitionKeys2) {
dataset.getPartitionOutput(partitionKey).addPartition();
}
}
});
dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {
@Override
public void apply() throws Exception {
// the consumed partition keys should correspond to partitionKeys2, and not include the dropped, but unconsumed
// partitions added before them
Assert.assertEquals(partitionKeys2, toKeys(partitionConsumer.consumePartitions().getPartitions()));
}
});
dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {
@Override
public void apply() throws Exception {
// consuming the partitions again, without adding any new partitions returns an empty iterator
Assert.assertTrue(partitionConsumer.consumePartitions().getPartitions().isEmpty());
}
});
dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {
@Override
public void apply() throws Exception {
// creating a new PartitionConsumer resets the consumption state. Consuming from it then returns an iterator
// with all the partition keys added after the deletions
ConcurrentPartitionConsumer partitionConsumer2 = new ConcurrentPartitionConsumer(dataset, new InMemoryStatePersistor());
Assert.assertEquals(partitionKeys2, toKeys(partitionConsumer2.consumePartitions().getPartitions()));
}
});
}
use of co.cask.cdap.api.dataset.lib.PartitionDetail in project cdap by caskdata.
the class PartitionBatchInput method setInput.
/**
* Used from the initialize method of the implementing batch job to configure as input a PartitionedFileSet that has
* specified a set of {@link Partition}s of a {@link PartitionedFileSet} to be processed by the run of the batch job.
* It does this by reading back the previous state, determining the new partitions to read, computing the new
* state, and persisting this new state. It then configures this dataset as input to the mapreduce context that is
* passed in.
*
* @param mapreduceContext MapReduce context used to access the PartitionedFileSet, and on which the input is
* configured
* @param partitionedFileSetName the name of the {@link PartitionedFileSet} to consume partitions from
* @param statePersistor a {@link DatasetStatePersistor} responsible for defining how the partition consumer state is
* managed
* @param consumerConfiguration defines parameters for the partition consumption
* @return a BatchPartitionCommitter used to persist the state of the partition consumer
*/
public static BatchPartitionCommitter setInput(MapReduceContext mapreduceContext, String partitionedFileSetName, DatasetStatePersistor statePersistor, ConsumerConfiguration consumerConfiguration) {
PartitionedFileSet partitionedFileSet = mapreduceContext.getDataset(partitionedFileSetName);
final PartitionConsumer partitionConsumer = new ConcurrentPartitionConsumer(partitionedFileSet, new DelegatingStatePersistor(mapreduceContext, statePersistor), consumerConfiguration);
final List<PartitionDetail> consumedPartitions = partitionConsumer.consumePartitions().getPartitions();
Map<String, String> arguments = new HashMap<>();
PartitionedFileSetArguments.addInputPartitions(arguments, consumedPartitions);
mapreduceContext.addInput(Input.ofDataset(partitionedFileSetName, arguments));
return new BatchPartitionCommitter() {
@Override
public void onFinish(boolean succeeded) {
partitionConsumer.onFinish(consumedPartitions, succeeded);
}
};
}
use of co.cask.cdap.api.dataset.lib.PartitionDetail in project cdap by caskdata.
the class PartitionedFileSetTest method testUpdateMetadata.
@Test
public void testUpdateMetadata() throws Exception {
final PartitionedFileSet dataset = dsFrameworkUtil.getInstance(pfsInstance);
dsFrameworkUtil.newTransactionExecutor((TransactionAware) dataset).execute(new TransactionExecutor.Subroutine() {
@Override
public void apply() throws Exception {
PartitionOutput partitionOutput = dataset.getPartitionOutput(PARTITION_KEY);
ImmutableMap<String, String> originalEntries = ImmutableMap.of("key1", "value1");
partitionOutput.setMetadata(originalEntries);
partitionOutput.addPartition();
ImmutableMap<String, String> updatedMetadata = ImmutableMap.of("key2", "value2");
dataset.addMetadata(PARTITION_KEY, updatedMetadata);
PartitionDetail partitionDetail = dataset.getPartition(PARTITION_KEY);
Assert.assertNotNull(partitionDetail);
HashMap<String, String> combinedEntries = Maps.newHashMap();
combinedEntries.putAll(originalEntries);
combinedEntries.putAll(updatedMetadata);
Assert.assertEquals(combinedEntries, partitionDetail.getMetadata().asMap());
// adding an entry, for a key that already exists will throw an Exception
try {
dataset.addMetadata(PARTITION_KEY, "key2", "value3");
Assert.fail("Expected not to be able to update an existing metadata entry");
} catch (DataSetException expected) {
}
PartitionKey nonexistentPartitionKey = PartitionKey.builder().addIntField("i", 42).addLongField("l", 17L).addStringField("s", "nonexistent").build();
try {
// adding an entry, for a key that already exists will throw an Exception
dataset.addMetadata(nonexistentPartitionKey, "key2", "value3");
Assert.fail("Expected not to be able to add metadata for a nonexistent partition");
} catch (DataSetException expected) {
}
}
});
}
Aggregations