Example 1 with PartitionDetail

Use of io.cdap.cdap.api.dataset.lib.PartitionDetail in project cdap by caskdata.

From class PartitionedFileSetDataset, method consumePartitions:

// PartitionConsumerState consists of two things:
// 1) A list of transaction IDs representing the transactions that were in progress during the previous call.
// Each of these transaction IDs needs to be checked for new partitions, because those transactions may have
// created partitions since the previous call.
// 2) A transaction ID from which to start scanning for new partitions. This is the exclusive upper bound at
// which the previous call stopped scanning for partitions.
// Note that each of the transaction IDs in (1) will be smaller than the transaction ID in (2).
@ReadWrite
@Override
public PartitionConsumerResult consumePartitions(PartitionConsumerState partitionConsumerState, int limit, Predicate<PartitionDetail> predicate) {
    List<Long> previousInProgress = partitionConsumerState.getVersionsToCheck();
    Set<Long> noLongerInProgress = setDiff(previousInProgress, tx.getInProgress());
    List<PartitionDetail> partitions = Lists.newArrayList();
    Iterator<Long> iter = noLongerInProgress.iterator();
    while (iter.hasNext()) {
        Long txId = iter.next();
        if (partitions.size() >= limit) {
            break;
        }
        try (Scanner scanner = partitionsTable.readByIndex(WRITE_PTR_COL, Bytes.toBytes(txId))) {
            scannerToPartitions(scanner, partitions, limit, predicate);
        }
        // remove the txId, since its partitions have already been added to the partitions list;
        // if it is not removed, it will be persisted in the state for the next scan
        iter.remove();
    }
    // exclusive scan end, to be used as the start for a next call to consumePartitions
    long scanUpTo;
    if (partitions.size() < limit) {
        // exclude our own writes (partitions created by the current transaction)
        scanUpTo = Math.min(tx.getWritePointer(), tx.getReadPointer() + 1);
        Long endTxId;
        try (Scanner scanner = partitionsTable.scanByIndex(WRITE_PTR_COL, Bytes.toBytes(partitionConsumerState.getStartVersion()), Bytes.toBytes(scanUpTo))) {
            endTxId = scannerToPartitions(scanner, partitions, limit, predicate);
        }
        if (endTxId != null) {
            // non-null means that the scanner was not exhausted
            scanUpTo = endTxId;
        }
    } else {
        // we have already hit the limit, so don't scan; instead, carry the startVersion forward as the
        // start version for the next call to consumePartitions
        scanUpTo = partitionConsumerState.getStartVersion();
    }
    List<Long> inProgressBeforeScanEnd = Lists.newArrayList(noLongerInProgress);
    for (long txId : tx.getInProgress()) {
        if (txId >= scanUpTo) {
            break;
        }
        inProgressBeforeScanEnd.add(txId);
    }
    return new PartitionConsumerResult(new PartitionConsumerState(scanUpTo, inProgressBeforeScanEnd), partitions);
}
Also used: Scanner(io.cdap.cdap.api.dataset.table.Scanner) PartitionConsumerResult(io.cdap.cdap.api.dataset.lib.PartitionConsumerResult) PartitionConsumerState(io.cdap.cdap.api.dataset.lib.PartitionConsumerState) AtomicLong(java.util.concurrent.atomic.AtomicLong) PartitionDetail(io.cdap.cdap.api.dataset.lib.PartitionDetail) ReadWrite(io.cdap.cdap.api.annotation.ReadWrite)
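
To make the state handoff concrete, here is a minimal sketch of driving consumePartitions in a loop. It is not part of the CDAP sources: consumeAll and process are hypothetical names, and it assumes the caller runs inside a transactional context that provides the PartitionedFileSet instance and that PartitionConsumerState.FROM_BEGINNING is the library's initial-state constant.

import io.cdap.cdap.api.Predicate;
import io.cdap.cdap.api.dataset.lib.PartitionConsumerResult;
import io.cdap.cdap.api.dataset.lib.PartitionConsumerState;
import io.cdap.cdap.api.dataset.lib.PartitionDetail;
import io.cdap.cdap.api.dataset.lib.PartitionedFileSet;

public final class ConsumePartitionsLoop {

    // Drains all currently visible partitions in batches of `limit`, threading the
    // returned PartitionConsumerState into the next call, exactly as described in
    // the comment at the top of consumePartitions.
    static void consumeAll(PartitionedFileSet pfs, int limit, Predicate<PartitionDetail> predicate) {
        PartitionConsumerState state = PartitionConsumerState.FROM_BEGINNING;
        while (true) {
            PartitionConsumerResult result = pfs.consumePartitions(state, limit, predicate);
            for (PartitionDetail partition : result.getPartitions()) {
                process(partition);
            }
            // the returned state carries the exclusive scan end plus the transaction
            // IDs that were still in progress and must be re-checked next time
            state = result.getPartitionConsumerState();
            if (result.getPartitions().isEmpty()) {
                break; // nothing new was visible during this call
            }
        }
    }

    // hypothetical processing hook
    private static void process(PartitionDetail partition) {
        System.out.println("consumed: " + partition.getPartitionKey());
    }
}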

Example 2 with PartitionDetail

Use of io.cdap.cdap.api.dataset.lib.PartitionDetail in project cdap by caskdata.

From class PartitionedFileSetDataset, method getInputFormatConfiguration:

@Override
public Map<String, String> getInputFormatConfiguration() {
    Collection<PartitionKey> inputKeys = getInputKeys();
    List<Location> inputLocations = new ArrayList<>(inputKeys.size());
    Map<String, PartitionKey> pathToKey = new HashMap<>(inputKeys.size());
    for (PartitionKey key : inputKeys) {
        PartitionDetail partition = getPartition(key);
        String path = Objects.requireNonNull(partition).getRelativePath();
        Location partitionLocation = files.getLocation(path);
        inputLocations.add(partitionLocation);
        pathToKey.put(partitionLocation.toURI().toString(), key);
    }
    Map<String, String> inputFormatConfiguration = files.getInputFormatConfiguration(inputLocations);
    inputFormatConfiguration.put(PATH_TO_PARTITIONING_MAPPING, GSON.toJson(pathToKey));
    return inputFormatConfiguration;
}
Also used: HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) PartitionKey(io.cdap.cdap.api.dataset.lib.PartitionKey) PartitionDetail(io.cdap.cdap.api.dataset.lib.PartitionDetail) Location(org.apache.twill.filesystem.Location)
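
Which keys getInputKeys() returns is determined by the dataset's runtime arguments. The following is a hedged sketch of building such arguments with a PartitionFilter; the field names "league" and "season" are made up for illustration, and it assumes the setInputPartitionFilter helper in PartitionedFileSetArguments.

import java.util.HashMap;
import java.util.Map;
import io.cdap.cdap.api.dataset.lib.PartitionFilter;
import io.cdap.cdap.api.dataset.lib.PartitionedFileSetArguments;

public final class InputSelectionSketch {

    // Builds runtime arguments that restrict the PartitionedFileSet input to
    // the partitions matching the filter.
    static Map<String, String> buildArguments() {
        Map<String, String> arguments = new HashMap<>();
        PartitionFilter filter = PartitionFilter.builder()
            .addValueCondition("league", "nfl")
            .addRangeCondition("season", 2000, 2010) // lower inclusive, upper exclusive
            .build();
        PartitionedFileSetArguments.setInputPartitionFilter(arguments, filter);
        return arguments;
    }
}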

Example 3 with PartitionDetail

Use of io.cdap.cdap.api.dataset.lib.PartitionDetail in project cdap by caskdata.

From class PartitionBatchInput, method setInput:

/**
 * Used from the initialize method of the implementing batch job to configure a {@link PartitionedFileSet} as input,
 * selecting the set of {@link Partition}s to be processed by this run of the batch job. It does this by reading back
 * the previous state, determining the new partitions to read, computing the new state, and persisting that new
 * state. It then configures the dataset as input to the MapReduce context that is passed in.
 *
 * @param mapreduceContext MapReduce context used to access the PartitionedFileSet, and on which the input is
 *                         configured
 * @param partitionedFileSetName the name of the {@link PartitionedFileSet} to consume partitions from
 * @param statePersistor a {@link DatasetStatePersistor} responsible for defining how the partition consumer state is
 *                       managed
 * @param consumerConfiguration defines parameters for the partition consumption
 * @return a BatchPartitionCommitter used to persist the state of the partition consumer
 */
public static BatchPartitionCommitter setInput(MapReduceContext mapreduceContext, String partitionedFileSetName, DatasetStatePersistor statePersistor, ConsumerConfiguration consumerConfiguration) {
    PartitionedFileSet partitionedFileSet = mapreduceContext.getDataset(partitionedFileSetName);
    final PartitionConsumer partitionConsumer = new ConcurrentPartitionConsumer(partitionedFileSet, new DelegatingStatePersistor(mapreduceContext, statePersistor), consumerConfiguration);
    final List<PartitionDetail> consumedPartitions = partitionConsumer.consumePartitions().getPartitions();
    Map<String, String> arguments = new HashMap<>();
    PartitionedFileSetArguments.addInputPartitions(arguments, consumedPartitions);
    mapreduceContext.addInput(Input.ofDataset(partitionedFileSetName, arguments));
    return succeeded -> partitionConsumer.onFinish(consumedPartitions, succeeded);
}
Also used: Input(io.cdap.cdap.api.data.batch.Input) DatasetStatePersistor(io.cdap.cdap.api.dataset.lib.DatasetStatePersistor) PartitionDetail(io.cdap.cdap.api.dataset.lib.PartitionDetail) List(java.util.List) PartitionedFileSetArguments(io.cdap.cdap.api.dataset.lib.PartitionedFileSetArguments) Beta(io.cdap.cdap.api.annotation.Beta) Map(java.util.Map) Partition(io.cdap.cdap.api.dataset.lib.Partition) PartitionedFileSet(io.cdap.cdap.api.dataset.lib.PartitionedFileSet) HashMap(java.util.HashMap) MapReduceContext(io.cdap.cdap.api.mapreduce.MapReduceContext)
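
A minimal sketch of a MapReduce program wired up this way (the dataset names "rawRecords" and "consumingState" and the state key are placeholders; it assumes the KVTableStatePersistor implementation of DatasetStatePersistor and the ConsumerConfiguration.DEFAULT constant from the same partitioned library):

import io.cdap.cdap.api.ProgramStatus;
import io.cdap.cdap.api.dataset.lib.partitioned.ConsumerConfiguration;
import io.cdap.cdap.api.dataset.lib.partitioned.KVTableStatePersistor;
import io.cdap.cdap.api.dataset.lib.partitioned.PartitionBatchInput;
import io.cdap.cdap.api.mapreduce.AbstractMapReduce;
import io.cdap.cdap.api.mapreduce.MapReduceContext;

public class IncrementalConsumingMapReduce extends AbstractMapReduce {

    private PartitionBatchInput.BatchPartitionCommitter partitionCommitter;

    @Override
    public void initialize() throws Exception {
        MapReduceContext context = getContext();
        // registers the not-yet-consumed partitions of "rawRecords" as input and
        // keeps the consumer state in the "consumingState" KeyValueTable
        partitionCommitter = PartitionBatchInput.setInput(
            context, "rawRecords",
            new KVTableStatePersistor("consumingState", "state.key"),
            ConsumerConfiguration.DEFAULT);
        // ... configure mapper, reducer, and output as usual ...
    }

    @Override
    public void destroy() {
        // marks the consumed partitions as processed, or returns them to the
        // working set if the run failed
        boolean succeeded = getContext().getState().getStatus() == ProgramStatus.COMPLETED;
        partitionCommitter.onFinish(succeeded);
    }
}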

Example 4 with PartitionDetail

Use of io.cdap.cdap.api.dataset.lib.PartitionDetail in project cdap by caskdata.

From class ConsumerWorkingSet, method populate:

/**
 * Populates the ConsumerWorkingSet by fetching partitions from the given PartitionedFileSet.
 *
 * @param partitionedFileSet the PartitionedFileSet to fetch partitions from
 * @param configuration the ConsumerConfiguration which defines parameters for consuming
 */
public void populate(PartitionedFileSet partitionedFileSet, ConsumerConfiguration configuration) {
    int numToPopulate = configuration.getMaxWorkingSetSize() - partitions.size();
    Predicate<PartitionDetail> predicate = configuration.getPartitionPredicate();
    io.cdap.cdap.api.dataset.lib.PartitionConsumerResult result = partitionedFileSet.consumePartitions(partitionConsumerState, numToPopulate, predicate);
    List<PartitionDetail> newPartitions = result.getPartitions();
    for (PartitionDetail partition : newPartitions) {
        addPartition(partition.getPartitionKey());
    }
    partitionConsumerState = result.getPartitionConsumerState();
}
Also used: PartitionDetail(io.cdap.cdap.api.dataset.lib.PartitionDetail)
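
The ConsumerConfiguration that caps the working set and filters partitions is typically built through its builder. A hedged sketch, assuming the builder exposes setMaxWorkingSetSize and setPartitionPredicate setters matching the getters used above; the metadata key "source" and value "ingest" are made up for illustration.

import io.cdap.cdap.api.Predicate;
import io.cdap.cdap.api.dataset.lib.PartitionDetail;
import io.cdap.cdap.api.dataset.lib.partitioned.ConsumerConfiguration;

public final class WorkingSetConfigSketch {

    static ConsumerConfiguration create() {
        return ConsumerConfiguration.builder()
            // populate() only tops the working set up to this size
            .setMaxWorkingSetSize(100)
            // only partitions passing this predicate enter the working set
            .setPartitionPredicate(new Predicate<PartitionDetail>() {
                @Override
                public boolean apply(PartitionDetail partition) {
                    return "ingest".equals(partition.getMetadata().get("source"));
                }
            })
            .build();
    }
}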

Example 5 with PartitionDetail

Use of io.cdap.cdap.api.dataset.lib.PartitionDetail in project cdap by caskdata.

From class ConcurrentPartitionConsumer, method selectPartitions:

private List<PartitionDetail> selectPartitions(PartitionAcceptor acceptor, ConsumerWorkingSet workingSet) {
    long now = System.currentTimeMillis();
    List<PartitionDetail> toConsume = new ArrayList<>();
    Iterator<ConsumablePartition> iter = workingSet.getPartitions().iterator();
    while (iter.hasNext()) {
        ConsumablePartition consumablePartition = iter.next();
        if (ProcessState.AVAILABLE != consumablePartition.getProcessState()) {
            continue;
        }
        PartitionDetail partition = getPartitionedFileSet().getPartition(consumablePartition.getPartitionKey());
        if (partition == null) {
            // no longer exists, so skip it and remove it from the working set
            iter.remove();
            continue;
        }
        PartitionAcceptor.Return accept = acceptor.accept(partition);
        switch(accept) {
            case ACCEPT:
                consumablePartition.take();
                consumablePartition.setTimestamp(now);
                toConsume.add(partition);
                continue;
            case SKIP:
                continue;
            case STOP:
                return toConsume;
        }
    }
    return toConsume;
}
Also used: ArrayList(java.util.ArrayList) PartitionDetail(io.cdap.cdap.api.dataset.lib.PartitionDetail)
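
PartitionAcceptor lets the caller decide, per partition, whether to take it, skip it, or end the scan. A minimal custom implementation might cap the number of accepted partitions. This sketch assumes PartitionAcceptor is the single-method interface implied by the code above; LimitAcceptor is a hypothetical name (the library ships a similar limit-style acceptor).

import io.cdap.cdap.api.dataset.lib.PartitionDetail;
import io.cdap.cdap.api.dataset.lib.partitioned.PartitionAcceptor;

public class LimitAcceptor implements PartitionAcceptor {

    private final int limit;
    private int accepted;

    public LimitAcceptor(int limit) {
        this.limit = limit;
    }

    @Override
    public Return accept(PartitionDetail partitionDetail) {
        if (accepted < limit) {
            accepted++;
            return Return.ACCEPT; // selectPartitions will take() it and timestamp it
        }
        return Return.STOP; // stop iterating; remaining partitions stay AVAILABLE
    }
}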

Aggregations

PartitionDetail (io.cdap.cdap.api.dataset.lib.PartitionDetail): 21 usages
PartitionedFileSet (io.cdap.cdap.api.dataset.lib.PartitionedFileSet): 14 usages
PartitionKey (io.cdap.cdap.api.dataset.lib.PartitionKey): 11 usages
Test (org.junit.Test): 10 usages
TransactionAware (org.apache.tephra.TransactionAware): 9 usages
TransactionExecutor (org.apache.tephra.TransactionExecutor): 9 usages
DataSetException (io.cdap.cdap.api.dataset.DataSetException): 6 usages
IOException (java.io.IOException): 6 usages
HashMap (java.util.HashMap): 6 usages
PartitionNotFoundException (io.cdap.cdap.api.dataset.PartitionNotFoundException): 5 usages
PartitionAlreadyExistsException (io.cdap.cdap.api.dataset.lib.PartitionAlreadyExistsException): 5 usages
PartitionOutput (io.cdap.cdap.api.dataset.lib.PartitionOutput): 5 usages
HashSet (java.util.HashSet): 5 usages
Location (org.apache.twill.filesystem.Location): 5 usages
ArrayList (java.util.ArrayList): 4 usages
List (java.util.List): 4 usages
ImmutableMap (com.google.common.collect.ImmutableMap): 3 usages
Predicate (io.cdap.cdap.api.Predicate): 3 usages
Partition (io.cdap.cdap.api.dataset.lib.Partition): 3 usages
PartitionFilter (io.cdap.cdap.api.dataset.lib.PartitionFilter): 3 usages