
Example 16 with PartitionKey

use of io.cdap.cdap.api.dataset.lib.PartitionKey in project cdap by caskdata.

the class ConcurrentPartitionConsumer method abort.

/**
 * Resets the process state of the given partition keys, because they were not successfully processed, or discards
 * a partition once it has already been retried the configured maximum number of times.
 */
protected void abort(ConsumerWorkingSet workingSet, List<? extends PartitionKey> partitionKeys) {
    List<PartitionKey> discardedPartitions = new ArrayList<>();
    for (PartitionKey key : partitionKeys) {
        ConsumablePartition consumablePartition = workingSet.lookup(key);
        assertInProgress(consumablePartition);
        // either reset its processState, or remove it from the workingSet, depending on how many times it has already been tried
        if (consumablePartition.getNumFailures() < getConfiguration().getMaxRetries()) {
            consumablePartition.retry();
        } else {
            discardedPartitions.add(key);
            consumablePartition.discard();
        }
    }
    if (!discardedPartitions.isEmpty()) {
        LOG.warn("Discarded keys due to being retried {} times: {}", getConfiguration().getMaxRetries(), discardedPartitions);
    }
}
Also used : ArrayList(java.util.ArrayList) PartitionKey(io.cdap.cdap.api.dataset.lib.PartitionKey)
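
For orientation, the snippet below is a minimal sketch (not taken from the CDAP sources) of how the pieces above are typically wired up: a PartitionKey is built with PartitionKey.builder(), and the retry limit that drives the discard decision in abort() comes from ConsumerConfiguration.setMaxRetries (see Example 20 below). The field names are illustrative assumptions.

// Hypothetical sketch: build a partition key and a consumer configuration with a retry limit.
// The field names ("year", "month") are illustrative, not taken from the CDAP sources.
PartitionKey key = PartitionKey.builder()
    .addIntField("year", 2024)
    .addIntField("month", 7)
    .build();

// Partitions that fail more than maxRetries times are discarded by abort() above.
ConsumerConfiguration configuration = ConsumerConfiguration.builder()
    .setMaxRetries(3)
    .build();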

Example 17 with PartitionKey

use of io.cdap.cdap.api.dataset.lib.PartitionKey in project cdap by caskdata.

the class MultiWriter method write.

@Override
public void write(K key, V value) throws IOException, InterruptedException {
    PartitionKey partitionKey = dynamicPartitioner.getPartitionKey(key, value);
    RecordWriter<K, V> rw = this.recordWriters.get(partitionKey);
    if (rw == null) {
        // if we don't have the record writer yet for the final path, create one and add it to the cache
        TaskAttemptContext taskAttemptContext = getKeySpecificContext(partitionKey);
        rw = getBaseRecordWriter(taskAttemptContext);
        this.recordWriters.put(partitionKey, rw);
        this.contexts.put(partitionKey, taskAttemptContext);
    }
    rw.write(key, value);
}
Also used : PartitionKey(io.cdap.cdap.api.dataset.lib.PartitionKey) TaskAttemptContext(org.apache.hadoop.mapreduce.TaskAttemptContext)
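
The write() above asks a DynamicPartitioner for the PartitionKey of each record and caches one RecordWriter per key. The sketch below is a hedged illustration of such a partitioner, assuming only the getPartitionKey(key, value) contract used above; the Hadoop LongWritable/Text types, the class name, and the hour-bucketing logic are assumptions for illustration.

// Hypothetical DynamicPartitioner: derives one partition per hour from a long timestamp key.
public class HourlyPartitioner extends DynamicPartitioner<LongWritable, Text> {

    @Override
    public PartitionKey getPartitionKey(LongWritable timestampMillis, Text value) {
        // bucket the record's timestamp into an hour and use it as the partition key field
        long hour = timestampMillis.get() / (60 * 60 * 1000L);
        return PartitionKey.builder()
            .addLongField("hour", hour)
            .build();
    }
}

With such a partitioner, records whose timestamps fall in the same hour map to the same PartitionKey and therefore reuse the cached RecordWriter in the MultiWriter above.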

Example 18 with PartitionKey

use of io.cdap.cdap.api.dataset.lib.PartitionKey in project cdap by caskdata.

the class MultiWriter method close.

@Override
public void close(TaskAttemptContext context) throws IOException, InterruptedException {
    try {
        Map<PartitionKey, RecordWriter<?, ?>> recordWriters = new HashMap<>();
        recordWriters.putAll(this.recordWriters);
        MultipleOutputs.closeRecordWriters(recordWriters, contexts);
        taskContext.flushOperations();
    } catch (Exception e) {
        throw new IOException(e);
    } finally {
        dynamicPartitioner.destroy();
    }
}
Also used : RecordWriter(org.apache.hadoop.mapreduce.RecordWriter) HashMap(java.util.HashMap) PartitionKey(io.cdap.cdap.api.dataset.lib.PartitionKey) IOException(java.io.IOException)

Example 19 with PartitionKey

use of io.cdap.cdap.api.dataset.lib.PartitionKey in project cdap by caskdata.

the class SingleWriter method write.

@Override
public void write(K key, V value) throws IOException, InterruptedException {
    PartitionKey partitionKey = dynamicPartitioner.getPartitionKey(key, value);
    if (!partitionKey.equals(currPartitionKey)) {
        // make sure we haven't written to this partition previously
        if (closedKeys.contains(partitionKey)) {
            throw new IllegalStateException(String.format("Encountered a partition key for which the writer has already been closed: '%s'.", partitionKey));
        }
        // currPartitionKey can be null for the first key-value pair, in which case there's no writer to close
        if (currPartitionKey != null) {
            // close the existing RecordWriter and create a new one for the new PartitionKey
            currRecordWriter.close(currContext);
            closedKeys.add(currPartitionKey);
        }
        currPartitionKey = partitionKey;
        currContext = getKeySpecificContext(currPartitionKey);
        currRecordWriter = getBaseRecordWriter(currContext);
    }
    currRecordWriter.write(key, value);
}
Also used : PartitionKey(io.cdap.cdap.api.dataset.lib.PartitionKey)
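
Unlike MultiWriter, SingleWriter keeps only one RecordWriter open, so it relies on two things: PartitionKey value equality to detect when the partition changes, and the input arriving grouped by partition key (once a key's writer is closed, seeing that key again throws the IllegalStateException above). A small hedged sketch of the equality semantics; the field name and value are illustrative.

// Hedged sketch: two keys with the same field values compare equal, so consecutive
// records for the same partition keep reusing the currently open writer.
PartitionKey first = PartitionKey.builder().addStringField("league", "nfl").build();
PartitionKey second = PartitionKey.builder().addStringField("league", "nfl").build();
boolean samePartition = first.equals(second); // true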

Example 20 with PartitionKey

use of io.cdap.cdap.api.dataset.lib.PartitionKey in project cdap by caskdata.

the class PartitionConsumerTest method testNumRetries.

@Test
public void testNumRetries() throws Exception {
    final PartitionedFileSet dataset = dsFrameworkUtil.getInstance(pfsInstance);
    final TransactionAware txAwareDataset = (TransactionAware) dataset;
    final int numRetries = 1;
    ConsumerConfiguration configuration = ConsumerConfiguration.builder().setMaxRetries(numRetries).build();
    final PartitionConsumer partitionConsumer = new ConcurrentPartitionConsumer(dataset, new InMemoryStatePersistor(), configuration);
    final PartitionKey partitionKey = generateUniqueKey();
    dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() throws Exception {
            dataset.getPartitionOutput(partitionKey).addPartition();
        }
    });
    dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() throws Exception {
            // consume and abort the partition numRetries + 1 times; the final abort discards it from the working set
            for (int i = 0; i < numRetries + 1; i++) {
                List<PartitionDetail> partitionDetails = partitionConsumer.consumePartitions(1).getPartitions();
                Assert.assertEquals(1, partitionDetails.size());
                Assert.assertEquals(partitionKey, partitionDetails.get(0).getPartitionKey());
                // aborting the processing of the partition
                partitionConsumer.onFinish(partitionDetails, false);
            }
            // after the 2nd abort, the partition is discarded entirely, and so no partitions are available for consuming
            PartitionConsumerResult result = partitionConsumer.consumePartitions(1);
            Assert.assertEquals(0, result.getPartitions().size());
            Assert.assertEquals(1, result.getFailedPartitions().size());
            Assert.assertEquals(partitionKey, result.getFailedPartitions().get(0).getPartitionKey());
        }
    });
}
Also used : ConcurrentPartitionConsumer(io.cdap.cdap.api.dataset.lib.partitioned.ConcurrentPartitionConsumer) PartitionConsumerResult(io.cdap.cdap.api.dataset.lib.partitioned.PartitionConsumerResult) PartitionedFileSet(io.cdap.cdap.api.dataset.lib.PartitionedFileSet) TransactionExecutor(org.apache.tephra.TransactionExecutor) TransactionAware(org.apache.tephra.TransactionAware) ConsumerConfiguration(io.cdap.cdap.api.dataset.lib.partitioned.ConsumerConfiguration) PartitionKey(io.cdap.cdap.api.dataset.lib.PartitionKey) ArrayList(java.util.ArrayList) ImmutableList(com.google.common.collect.ImmutableList) List(java.util.List) PartitionConsumer(io.cdap.cdap.api.dataset.lib.partitioned.PartitionConsumer) ConcurrentPartitionConsumer(io.cdap.cdap.api.dataset.lib.partitioned.ConcurrentPartitionConsumer) Test(org.junit.Test)
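
For contrast with the abort path exercised above, the success path uses the same two calls but reports success: onFinish(partitions, true) marks the partitions as processed instead of returning them to the working set or discarding them. A minimal hedged sketch using only the API calls already shown in this test:

// Hedged sketch of successful consumption, inside the same kind of transaction subroutine as above.
List<PartitionDetail> partitions = partitionConsumer.consumePartitions(10).getPartitions();
// ... process the partitions ...
// 'true' reports successful processing, so the partitions are neither retried nor discarded
partitionConsumer.onFinish(partitions, true);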

Aggregations

PartitionKey (io.cdap.cdap.api.dataset.lib.PartitionKey) 121
Test (org.junit.Test) 55
PartitionedFileSet (io.cdap.cdap.api.dataset.lib.PartitionedFileSet) 53
TransactionAware (org.apache.tephra.TransactionAware) 34
TransactionExecutor (org.apache.tephra.TransactionExecutor) 34
IOException (java.io.IOException) 26
PartitionDetail (io.cdap.cdap.api.dataset.lib.PartitionDetail) 23
ConcurrentPartitionConsumer (io.cdap.cdap.api.dataset.lib.partitioned.ConcurrentPartitionConsumer) 22
PartitionConsumer (io.cdap.cdap.api.dataset.lib.partitioned.PartitionConsumer) 22
ArrayList (java.util.ArrayList) 22
List (java.util.List) 22
HashMap (java.util.HashMap) 21
ImmutableList (com.google.common.collect.ImmutableList) 18
DataSetException (io.cdap.cdap.api.dataset.DataSetException) 18
HashSet (java.util.HashSet) 18
Partition (io.cdap.cdap.api.dataset.lib.Partition) 14
ConsumerConfiguration (io.cdap.cdap.api.dataset.lib.partitioned.ConsumerConfiguration) 14
DatasetId (io.cdap.cdap.proto.id.DatasetId) 14
Map (java.util.Map) 14
Location (org.apache.twill.filesystem.Location) 14