use of co.cask.cdap.api.dataset.lib.PartitionConsumerState in project cdap by caskdata.
the class PartitionedFileSetDataset method consumePartitions.
// PartitionConsumerState consists of two things:
// 1) A list of transaction IDs representing the transactions that were in progress during the previous call.
// Each of these transaction IDs needs to be checked for new partitions, because there may be partitions created by
// those transactions since the previous call.
// 2) A transaction ID from which to start scanning for new partitions. This is the exclusive end of the range that
// the previous call stopped scanning partitions at.
// Note that each of the transaction IDs in (1) will be smaller than the transaction ID in (2).
/**
 * Collects partitions that have not yet been handed out for the given consumer state.
 *
 * <p>Works in two phases: first, it revisits the transactions recorded in the state that are no longer
 * in progress; then, if the limit has not been reached, it scans the write-pointer index starting at the
 * state's start version.
 *
 * @param partitionConsumerState state returned by the previous call (start version plus versions to check)
 * @param limit threshold on the number of collected partitions, checked before each lookup and passed on
 *              to {@code scannerToPartitions}
 * @param predicate filter handed to {@code scannerToPartitions}
 * @return the collected partitions together with the state to pass to the next call
 */
@ReadWrite
@Override
public PartitionConsumerResult consumePartitions(PartitionConsumerState partitionConsumerState, int limit, Predicate<PartitionDetail> predicate) {
  List<PartitionDetail> collected = Lists.newArrayList();

  // Phase 1: transactions that were in progress at the previous call, but are not anymore.
  Set<Long> pendingTxIds = setDiff(partitionConsumerState.getVersionsToCheck(), tx.getInProgress());
  for (Iterator<Long> pending = pendingTxIds.iterator(); pending.hasNext() && collected.size() < limit; ) {
    Long finishedTxId = pending.next();
    try (Scanner scanner = partitionsTable.readByIndex(WRITE_PTR_COL, Bytes.toBytes(finishedTxId))) {
      scannerToPartitions(scanner, collected, limit, predicate);
    }
    // This transaction has been looked up; drop it so that it is not carried over into the next state.
    pending.remove();
  }

  // Phase 2: determine the exclusive scan end, which doubles as the start version of the next call.
  long nextStartVersion;
  if (collected.size() >= limit) {
    // The limit is already reached, so skip the scan and let the next call resume from the same start version.
    nextStartVersion = partitionConsumerState.getStartVersion();
  } else {
    // Never scan past our own write pointer: no read-your-own-writes for partitions.
    long scanEnd = Math.min(tx.getWritePointer(), tx.getReadPointer() + 1);
    Long stoppedAtTxId;
    try (Scanner scanner = partitionsTable.scanByIndex(WRITE_PTR_COL, Bytes.toBytes(partitionConsumerState.getStartVersion()), Bytes.toBytes(scanEnd))) {
      stoppedAtTxId = scannerToPartitions(scanner, collected, limit, predicate);
    }
    // A non-null value means that the scanner was not exhausted; the next call resumes from there.
    nextStartVersion = (stoppedAtTxId != null) ? stoppedAtTxId : scanEnd;
  }

  // Carry over the transactions not yet looked up, plus all in-progress transactions below the scan end.
  // NOTE(review): stopping at the first txId >= scan end presumes tx.getInProgress() is sorted ascending - confirm.
  List<Long> versionsToCheck = Lists.newArrayList(pendingTxIds);
  for (long inProgressTxId : tx.getInProgress()) {
    if (inProgressTxId < nextStartVersion) {
      versionsToCheck.add(inProgressTxId);
    } else {
      break;
    }
  }

  PartitionConsumerState nextState = new PartitionConsumerState(nextStartVersion, versionsToCheck);
  return new PartitionConsumerResult(nextState, collected);
}
use of co.cask.cdap.api.dataset.lib.PartitionConsumerState in project cdap by caskdata.
the class PartitionConsumerStateTest method testByteSerialization.
/**
 * Runs a serialization round trip for several consumer states: a small ascending list, an unordered list
 * and an empty list.
 */
@Test
public void testByteSerialization() {
  // small, ascending list of versions
  PartitionConsumerState ascendingState = new PartitionConsumerState(2L, Lists.newArrayList(1L, 2L, 3L));
  testSerDe(ascendingState);

  // versions given in no particular order
  PartitionConsumerState unorderedState = new PartitionConsumerState(0L, Lists.newArrayList(3L, 5L, 100L, 61L, 12L));
  testSerDe(unorderedState);

  // largest possible first argument combined with an empty list
  PartitionConsumerState emptyListState = new PartitionConsumerState(Long.MAX_VALUE, Lists.<Long>newArrayList());
  testSerDe(emptyListState);
}
use of co.cask.cdap.api.dataset.lib.PartitionConsumerState in project cdap by caskdata.
the class PartitionConsumerStateTest method testSerDe.
/**
 * Serializes the given state to bytes, checks the leading format-version byte, and verifies that
 * deserializing those bytes yields a state equal to the original.
 *
 * @param stateToSerialize the state to round-trip through its byte representation
 */
private void testSerDe(PartitionConsumerState stateToSerialize) {
  byte[] serialized = stateToSerialize.toBytes();
  // The first byte is expected to hold the serialization format version, which is 0.
  Assert.assertEquals(0, serialized[0]);
  // The round trip must reproduce an equal state.
  Assert.assertEquals(stateToSerialize, PartitionConsumerState.fromBytes(serialized));
}
Aggregations