use of io.cdap.cdap.api.dataset.lib.PartitionDetail in project cdap by caskdata.
the class PartitionedFileSetDataset method consumePartitions.
// PartitionConsumerState consists of two things:
// 1) A list of transaction IDs representing the transactions that were in progress during the previous call.
//    Each of these transaction IDs needs to be checked for new partitions, because partitions may have been
//    created by those transactions since the previous call.
// 2) A transaction ID from which to start scanning for new partitions. This is the exclusive end of the range
//    that the previous call stopped scanning partitions at.
// Note that each of the transaction IDs in (1) will be smaller than the transaction ID in (2).
@ReadWrite
@Override
public PartitionConsumerResult consumePartitions(PartitionConsumerState partitionConsumerState,
                                                 int limit, Predicate<PartitionDetail> predicate) {
  List<Long> previousInProgress = partitionConsumerState.getVersionsToCheck();
  Set<Long> noLongerInProgress = setDiff(previousInProgress, tx.getInProgress());
  List<PartitionDetail> partitions = Lists.newArrayList();
  Iterator<Long> iter = noLongerInProgress.iterator();
  while (iter.hasNext()) {
    Long txId = iter.next();
    if (partitions.size() >= limit) {
      break;
    }
    try (Scanner scanner = partitionsTable.readByIndex(WRITE_PTR_COL, Bytes.toBytes(txId))) {
      scannerToPartitions(scanner, partitions, limit, predicate);
    }
    // remove the txIds as they are added to the partitions list already
    // if they're not removed, they will be persisted in the state for the next scan
    iter.remove();
  }
  // exclusive scan end, to be used as the start for a next call to consumePartitions
  long scanUpTo;
  if (partitions.size() < limit) {
    // no read your own writes (partitions)
    scanUpTo = Math.min(tx.getWritePointer(), tx.getReadPointer() + 1);
    Long endTxId;
    try (Scanner scanner = partitionsTable.scanByIndex(WRITE_PTR_COL,
                                                       Bytes.toBytes(partitionConsumerState.getStartVersion()),
                                                       Bytes.toBytes(scanUpTo))) {
      endTxId = scannerToPartitions(scanner, partitions, limit, predicate);
    }
    if (endTxId != null) {
      // a non-null value means that the scanner was not exhausted
      scanUpTo = endTxId;
    }
  } else {
    // if we have already hit the limit, don't scan; instead, use the startVersion as the startVersion for the next
    // call to consumePartitions
    scanUpTo = partitionConsumerState.getStartVersion();
  }
  List<Long> inProgressBeforeScanEnd = Lists.newArrayList(noLongerInProgress);
  for (long txId : tx.getInProgress()) {
    if (txId >= scanUpTo) {
      break;
    }
    inProgressBeforeScanEnd.add(txId);
  }
  return new PartitionConsumerResult(new PartitionConsumerState(scanUpTo, inProgressBeforeScanEnd), partitions);
}
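A caller-side sketch of how this method is typically driven: thread the returned state into the next call so that each call only picks up partitions added since the previous one. This assumes pfs is a PartitionedFileSet handle; the initial state constant PartitionConsumerState.FROM_BEGINNING, the limit of 100, and the always-true predicate are illustrative choices, not taken from the snippet above.
// minimal consumption loop sketch; pfs is an assumed PartitionedFileSet instance
PartitionConsumerState state = PartitionConsumerState.FROM_BEGINNING;   // assumed starting state
PartitionConsumerResult result = pfs.consumePartitions(state, 100, partitionDetail -> true);
for (PartitionDetail partition : result.getPartitions()) {
  // each detail carries its key and the relative path of its files
  System.out.println(partition.getPartitionKey() + " -> " + partition.getRelativePath());
}
// persist this state (e.g. in a KeyValueTable) and pass it into the next call
state = result.getPartitionConsumerState();
Because the state tracks both the scan watermark and the transactions that were still in progress, partitions committed by a transaction that was open during one call are picked up by a later call rather than being missed.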
use of io.cdap.cdap.api.dataset.lib.PartitionDetail in project cdap by caskdata.
the class PartitionedFileSetDataset method getInputFormatConfiguration.
@Override
public Map<String, String> getInputFormatConfiguration() {
  Collection<PartitionKey> inputKeys = getInputKeys();
  List<Location> inputLocations = new ArrayList<>(inputKeys.size());
  Map<String, PartitionKey> pathToKey = new HashMap<>(inputKeys.size());
  for (PartitionKey key : inputKeys) {
    PartitionDetail partition = getPartition(key);
    String path = Objects.requireNonNull(partition).getRelativePath();
    Location partitionLocation = files.getLocation(path);
    inputLocations.add(partitionLocation);
    pathToKey.put(partitionLocation.toURI().toString(), key);
  }
  Map<String, String> inputFormatConfiguration = files.getInputFormatConfiguration(inputLocations);
  inputFormatConfiguration.put(PATH_TO_PARTITIONING_MAPPING, GSON.toJson(pathToKey));
  return inputFormatConfiguration;
}
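For context, the input keys resolved above usually come from the dataset's runtime arguments. A hedged sketch, assuming the PartitionedFileSetArguments helper for setting an input partition filter; the field names "league" and "season" are examples only.
// set a partition filter in the runtime arguments before the job starts;
// getInputKeys() later resolves the matching partitions against the dataset
Map<String, String> args = new HashMap<>();
PartitionFilter filter = PartitionFilter.builder()
  .addValueCondition("league", "nfl")
  .addRangeCondition("season", 2014, 2016)   // lower bound inclusive, upper bound exclusive
  .build();
PartitionedFileSetArguments.setInputPartitionFilter(args, filter);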
use of io.cdap.cdap.api.dataset.lib.PartitionDetail in project cdap by caskdata.
the class PartitionBatchInput method setInput.
/**
 * Used from the initialize method of the implementing batch job to configure a {@link PartitionedFileSet} as
 * input, with the set of {@link Partition}s to be processed by this run of the batch job.
 * It does this by reading back the previous state, determining the new partitions to read, computing the new
 * state, and persisting this new state. It then configures this dataset as input to the mapreduce context that is
 * passed in.
 *
 * @param mapreduceContext MapReduce context used to access the PartitionedFileSet, and on which the input is
 *                         configured
 * @param partitionedFileSetName the name of the {@link PartitionedFileSet} to consume partitions from
 * @param statePersistor a {@link DatasetStatePersistor} responsible for defining how the partition consumer state
 *                       is managed
 * @param consumerConfiguration defines parameters for the partition consumption
 * @return a BatchPartitionCommitter used to persist the state of the partition consumer
 */
public static BatchPartitionCommitter setInput(MapReduceContext mapreduceContext, String partitionedFileSetName,
                                               DatasetStatePersistor statePersistor,
                                               ConsumerConfiguration consumerConfiguration) {
  PartitionedFileSet partitionedFileSet = mapreduceContext.getDataset(partitionedFileSetName);
  final PartitionConsumer partitionConsumer =
    new ConcurrentPartitionConsumer(partitionedFileSet,
                                    new DelegatingStatePersistor(mapreduceContext, statePersistor),
                                    consumerConfiguration);
  final List<PartitionDetail> consumedPartitions = partitionConsumer.consumePartitions().getPartitions();
  Map<String, String> arguments = new HashMap<>();
  PartitionedFileSetArguments.addInputPartitions(arguments, consumedPartitions);
  mapreduceContext.addInput(Input.ofDataset(partitionedFileSetName, arguments));
  return succeeded -> partitionConsumer.onFinish(consumedPartitions, succeeded);
}
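A minimal sketch of how a MapReduce program might call setInput and later commit the consumer state, loosely following the pattern used in CDAP's examples. The class name, the dataset names ("lines", "consumingState"), the state row key, and the choice of KVTableStatePersistor with ConsumerConfiguration.DEFAULT are assumptions for illustration.
public class PartitionConsumingMapReduce extends AbstractMapReduce {
  private PartitionBatchInput.BatchPartitionCommitter partitionCommitter;

  @Override
  public void initialize() throws Exception {
    // configure the "lines" PartitionedFileSet as input, persisting consumer state in a KeyValueTable
    partitionCommitter = PartitionBatchInput.setInput(
      getContext(), "lines",
      new KVTableStatePersistor("consumingState", "state.key"),
      ConsumerConfiguration.DEFAULT);
  }

  @Override
  public void destroy() {
    // mark the consumed partitions as succeeded or failed, and persist the updated consumer state
    boolean succeeded = getContext().getState().getStatus() == ProgramStatus.COMPLETED;
    partitionCommitter.onFinish(succeeded);
  }
}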
use of io.cdap.cdap.api.dataset.lib.PartitionDetail in project cdap by caskdata.
the class ConsumerWorkingSet method populate.
/**
 * Populates the ConsumerWorkingSet by fetching partitions from the given PartitionedFileSet.
 *
 * @param partitionedFileSet the PartitionedFileSet to fetch partitions from
 * @param configuration the ConsumerConfiguration which defines parameters for consuming
 */
public void populate(PartitionedFileSet partitionedFileSet, ConsumerConfiguration configuration) {
  int numToPopulate = configuration.getMaxWorkingSetSize() - partitions.size();
  Predicate<PartitionDetail> predicate = configuration.getPartitionPredicate();
  io.cdap.cdap.api.dataset.lib.PartitionConsumerResult result =
    partitionedFileSet.consumePartitions(partitionConsumerState, numToPopulate, predicate);
  List<PartitionDetail> partitions = result.getPartitions();
  for (PartitionDetail partition : partitions) {
    addPartition(partition.getPartitionKey());
  }
  partitionConsumerState = result.getPartitionConsumerState();
}
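The working-set size and predicate read above are supplied by the caller when building the ConsumerConfiguration. A small sketch with illustrative values; the builder method names are assumed to mirror the getters used in this method.
// illustrative configuration: cap the working set at 1000 partitions and accept every partition
ConsumerConfiguration configuration = ConsumerConfiguration.builder()
  .setMaxWorkingSetSize(1000)
  .setPartitionPredicate(partitionDetail -> true)
  .build();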
use of io.cdap.cdap.api.dataset.lib.PartitionDetail in project cdap by caskdata.
the class ConcurrentPartitionConsumer method selectPartitions.
private List<PartitionDetail> selectPartitions(PartitionAcceptor acceptor, ConsumerWorkingSet workingSet) {
  long now = System.currentTimeMillis();
  List<PartitionDetail> toConsume = new ArrayList<>();
  Iterator<ConsumablePartition> iter = workingSet.getPartitions().iterator();
  while (iter.hasNext()) {
    ConsumablePartition consumablePartition = iter.next();
    if (ProcessState.AVAILABLE != consumablePartition.getProcessState()) {
      continue;
    }
    PartitionDetail partition = getPartitionedFileSet().getPartition(consumablePartition.getPartitionKey());
    if (partition == null) {
      // no longer exists, so skip it and remove it from the working set
      iter.remove();
      continue;
    }
    PartitionAcceptor.Return accept = acceptor.accept(partition);
    switch (accept) {
      case ACCEPT:
        consumablePartition.take();
        consumablePartition.setTimestamp(now);
        toConsume.add(partition);
        continue;
      case SKIP:
        continue;
      case STOP:
        return toConsume;
    }
  }
  return toConsume;
}
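For illustration, a PartitionAcceptor that this method could be driven with: it accepts partitions up to a fixed count and then stops the selection. This hypothetical LimitingAcceptor only demonstrates the ACCEPT/STOP contract visible in the switch above; it is not the acceptor CDAP itself uses, and it assumes PartitionAcceptor can be implemented directly.
public class LimitingAcceptor implements PartitionAcceptor {
  private final int limit;
  private int accepted;

  public LimitingAcceptor(int limit) {
    this.limit = limit;
  }

  @Override
  public Return accept(PartitionDetail partitionDetail) {
    if (accepted >= limit) {
      return Return.STOP;     // end the selection; remaining partitions stay AVAILABLE in the working set
    }
    accepted++;
    return Return.ACCEPT;     // the partition is marked taken and added to the returned list
  }
}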