use of co.cask.cdap.api.dataset.lib.PartitionConsumerResult in project cdap by caskdata.
the class PartitionedFileSetDataset method consumePartitions.
// PartitionConsumerState consists of two things:
// 1) A list of transaction IDs representing the transactions that were in progress during the previous call.
// Each of these transaction IDs needs to be checked for new partitions because there may be partitions created by
// those transactions since the previous call.
// 2) A transaction ID from which to start scanning for new partitions. This is the exclusive end of the range that
// the previous call scanned.
// Note that each of the transaction IDs in (1) will be smaller than the transaction ID in (2).
/**
 * Collects partitions that have become available since the given consumer state was produced.
 *
 * <p>Works in two phases. First, every transaction ID recorded in the state that is no longer reported as
 * in progress is looked up in the write-pointer index. Second, if the result is still below {@code limit},
 * the index is scanned from the state's start version up to an exclusive end derived from the current
 * transaction's read and write pointers.
 *
 * @param partitionConsumerState state returned by a previous call: a start version plus the transaction IDs
 *                               that still have to be re-checked
 * @param limit once the result holds at least this many partitions, no further lookups are started; the value
 *              is also handed to {@code scannerToPartitions} (not shown here), which decides where a scan stops
 * @param predicate handed to {@code scannerToPartitions} together with each scanner
 * @return the collected partitions together with the state to pass to the next call
 */
@ReadWrite
@Override
public PartitionConsumerResult consumePartitions(PartitionConsumerState partitionConsumerState, int limit, Predicate<PartitionDetail> predicate) {
  List<Long> versionsFromLastCall = partitionConsumerState.getVersionsToCheck();
  // IDs that were in progress during the previous call but are not in progress any more
  Set<Long> finishedSinceLastCall = setDiff(versionsFromLastCall, tx.getInProgress());
  List<PartitionDetail> consumed = Lists.newArrayList();

  // Phase 1: pick up partitions written by the transactions that finished in the meantime.
  Iterator<Long> finished = finishedSinceLastCall.iterator();
  while (consumed.size() < limit && finished.hasNext()) {
    Long txId = finished.next();
    try (Scanner scanner = partitionsTable.readByIndex(WRITE_PTR_COL, Bytes.toBytes(txId))) {
      scannerToPartitions(scanner, consumed, limit, predicate);
    }
    // This ID has been handled, so drop it; whatever stays in the set is carried over
    // into the state for the next call.
    finished.remove();
  }

  // Phase 2: determine the exclusive scan end, which doubles as the start version of the next call.
  long nextStartVersion;
  if (consumed.size() >= limit) {
    // Limit already reached: skip the scan and hand the unchanged start version on to the next call.
    nextStartVersion = partitionConsumerState.getStartVersion();
  } else {
    // Cap the range so that partitions written by the current transaction are not read back.
    long scanEnd = Math.min(tx.getWritePointer(), tx.getReadPointer() + 1);
    Long stoppedAt;
    try (Scanner scanner = partitionsTable.scanByIndex(WRITE_PTR_COL, Bytes.toBytes(partitionConsumerState.getStartVersion()), Bytes.toBytes(scanEnd))) {
      stoppedAt = scannerToPartitions(scanner, consumed, limit, predicate);
    }
    // A non-null value means the scanner was not exhausted; the next call resumes from there.
    nextStartVersion = (stoppedAt != null) ? stoppedAt : scanEnd;
  }

  // Versions to re-check next time: the unprocessed finished IDs plus every in-progress ID below the scan end.
  List<Long> versionsToCheckNext = Lists.newArrayList(finishedSinceLastCall);
  for (long inFlight : tx.getInProgress()) {
    if (inFlight < nextStartVersion) {
      versionsToCheckNext.add(inFlight);
    } else {
      // Stops at the first ID at or beyond the scan end (presumably the in-progress IDs are in
      // ascending order -- confirm against the transaction API).
      break;
    }
  }

  PartitionConsumerState nextState = new PartitionConsumerState(nextStartVersion, versionsToCheckNext);
  return new PartitionConsumerResult(nextState, consumed);
}
use of co.cask.cdap.api.dataset.lib.PartitionConsumerResult in project cdap by caskdata.
the class SimplePartitionConsumer method consumePartitions.
/**
 * Consumes the partitions of the underlying {@link PartitionedFileSet} that were created since the previous
 * call to this method, and remembers the resulting consumer state for the next call.
 *
 * <p>Partitions created in in-progress transactions are excluded, including the transaction in which this
 * method itself is called.
 *
 * @param limit limit to be applied while consuming partitions
 * @param predicate predicate to be applied while consuming partitions
 * @return the list of {@link PartitionDetail}s consumed by this call
 */
public List<PartitionDetail> consumePartitions(int limit, Predicate<PartitionDetail> predicate) {
  PartitionConsumerResult result = partitionedFileSet.consumePartitions(partitionConsumerState, limit, predicate);
  PartitionConsumerState nextState = result.getPartitionConsumerState();
  List<PartitionDetail> consumed = result.getPartitions();
  // Keep the returned state so the next call continues from where this one stopped.
  partitionConsumerState = nextState;
  return consumed;
}
Aggregations