Example 21 with PartitionKey

Use of co.cask.cdap.api.dataset.lib.PartitionKey in project cdap by caskdata.

From the class MultiWriter, method write.

public void write(K key, V value) throws IOException, InterruptedException {
    PartitionKey partitionKey = dynamicPartitioner.getPartitionKey(key, value);
    RecordWriter<K, V> rw = this.recordWriters.get(partitionKey);
    if (rw == null) {
        // if we don't have the record writer yet for the final path, create one and add it to the cache
        TaskAttemptContext taskAttemptContext = getKeySpecificContext(partitionKey);
        rw = getBaseRecordWriter(taskAttemptContext);
        this.recordWriters.put(partitionKey, rw);
        this.contexts.put(partitionKey, taskAttemptContext);
    }
    rw.write(key, value);
}
Also used: PartitionKey (co.cask.cdap.api.dataset.lib.PartitionKey), TaskAttemptContext (org.apache.hadoop.mapreduce.TaskAttemptContext)
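
For context on where that PartitionKey comes from: the MapReduce task is configured with a DynamicPartitioner whose getPartitionKey(key, value) is invoked per record, as seen above. A minimal sketch of such a partitioner, assuming a single hypothetical string partition field named "category" (not taken from this page):

import co.cask.cdap.api.dataset.lib.DynamicPartitioner;
import co.cask.cdap.api.dataset.lib.PartitionKey;
import org.apache.hadoop.io.Text;

public class CategoryPartitioner extends DynamicPartitioner<Text, Text> {
    @Override
    public PartitionKey getPartitionKey(Text key, Text value) {
        // "category" is a hypothetical field; a real partitioner must produce keys
        // that match the Partitioning declared on the PartitionedFileSet
        return PartitionKey.builder().addStringField("category", key.toString()).build();
    }
}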

Example 22 with PartitionKey

Use of co.cask.cdap.api.dataset.lib.PartitionKey in project cdap by caskdata.

From the class MultiWriter, method close.

@Override
public void close(TaskAttemptContext context) throws IOException, InterruptedException {
    try {
        // copy the cached writers and close them all against their per-partition contexts
        Map<PartitionKey, RecordWriter<?, ?>> recordWriters = new HashMap<>(this.recordWriters);
        MultipleOutputs.closeRecordWriters(recordWriters, contexts);
        // flush any dataset operations buffered by the task before the attempt finishes
        taskContext.flushOperations();
    } catch (Exception e) {
        throw new IOException(e);
    } finally {
        // always give the partitioner a chance to clean up
        dynamicPartitioner.destroy();
    }
}
Also used: RecordWriter (org.apache.hadoop.mapreduce.RecordWriter), HashMap (java.util.HashMap), PartitionKey (co.cask.cdap.api.dataset.lib.PartitionKey), IOException (java.io.IOException)

Example 23 with PartitionKey

Use of co.cask.cdap.api.dataset.lib.PartitionKey in project cdap by caskdata.

From the class ConnectorSink, method prepareRun.

@Override
public void prepareRun(BatchSinkContext context) throws Exception {
    Map<String, String> arguments = new HashMap<>();
    // route this phase's output to a dedicated partition keyed by the phase name
    PartitionKey outputPartition = PartitionKey.builder().addStringField("phase", phaseName).build();
    PartitionedFileSetArguments.setOutputPartitionKey(arguments, outputPartition);
    context.addOutput(datasetName, arguments);
}
Also used: HashMap (java.util.HashMap), PartitionKey (co.cask.cdap.api.dataset.lib.PartitionKey)
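
A plausible counterpart on the reading side (not shown on this page) would select the same partition by filter. PartitionFilter.builder() and PartitionedFileSetArguments.setInputPartitionFilter are part of the CDAP API; the context.setInput call is assumed to mirror the addOutput above:

// Hedged sketch: a downstream phase selecting the partition written by prepareRun above.
Map<String, String> sourceArguments = new HashMap<>();
PartitionFilter phaseFilter = PartitionFilter.builder().addValueCondition("phase", phaseName).build();
PartitionedFileSetArguments.setInputPartitionFilter(sourceArguments, phaseFilter);
// assumed counterpart of context.addOutput(datasetName, arguments)
context.setInput(datasetName, sourceArguments);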

Example 24 with PartitionKey

Use of co.cask.cdap.api.dataset.lib.PartitionKey in project cdap by caskdata.

From the class TimePartitionedFileSetDataset, method partitionFiltersForTimeRange.

// returns a list of partition filters that cover the specified time range.
// this may return a list with a single null filter (in case the range is unbounded in both directions)
@VisibleForTesting
static List<PartitionFilter> partitionFiltersForTimeRange(long startTime, long endTime) {
    // unsatisfiable range
    if (startTime >= endTime) {
        return Collections.emptyList();
    }
    PartitionKey keyLower = startTime <= 0 ? null : partitionKeyForTime(startTime);
    PartitionKey keyUpper = endTime == Long.MAX_VALUE ? null : partitionKeyForTime(endTime);
    // no bounds -> no filter
    if (keyLower == null && keyUpper == null) {
        // no filter needed to select all time
        return Collections.singletonList(null);
    }
    List<PartitionFilter> filters = Lists.newArrayList();
    String[] allFields = PARTITIONING.getFields().keySet().toArray(new String[PARTITIONING.getFields().size()]);
    // if there is no lower bound, we only need the filters for the upper bound
    if (keyLower == null) {
        addUpperFilters(allFields, 0, keyUpper, filters, initialSupplier());
        return filters;
    }
    // if there is no upper bound, we only need the filters for the lower bound
    if (keyUpper == null) {
        addLowerFilters(allFields, 0, keyLower, filters, initialSupplier());
        return filters;
    }
    return filtersFor(allFields, 0, keyLower, keyUpper, filters, initialSupplier());
}
Also used: PartitionFilter (co.cask.cdap.api.dataset.lib.PartitionFilter), PartitionKey (co.cask.cdap.api.dataset.lib.PartitionKey), VisibleForTesting (com.google.common.annotations.VisibleForTesting)
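
The guard clauses at the top of the method fully determine its edge-case behavior; the following calls illustrate them (the timestamps are arbitrary):

// start >= end: the range is unsatisfiable, so no filters are returned
List<PartitionFilter> empty = partitionFiltersForTimeRange(1000L, 1000L);       // -> empty list

// unbounded in both directions: a single null filter, meaning "select everything"
List<PartitionFilter> all = partitionFiltersForTimeRange(0L, Long.MAX_VALUE);   // -> [null]

// bounded only above: only the upper-bound filters are generated
List<PartitionFilter> upperOnly = partitionFiltersForTimeRange(0L, 1422000000000L);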

Example 25 with PartitionKey

Use of co.cask.cdap.api.dataset.lib.PartitionKey in project cdap by caskdata.

From the class PartitionedFileSetDataset, method getOutputFormatConfiguration.

@Override
public Map<String, String> getOutputFormatConfiguration() {
    if (isExternal) {
        throw new UnsupportedOperationException("Output is not supported for external partitioned file set '" + spec.getName() + "'");
    }
    // copy the output properties of the embedded file set to the output arguments
    Map<String, String> outputArgs = new HashMap<>(files.getOutputFormatConfiguration());
    // we set the file set's output path in the definition's getDataset(), so there is no need to configure it again.
    // here we just want to validate that an output partition key or dynamic partitioner was specified in the arguments.
    PartitionKey outputKey = PartitionedFileSetArguments.getOutputPartitionKey(runtimeArguments, getPartitioning());
    if (outputKey == null) {
        String dynamicPartitionerClassName = PartitionedFileSetArguments.getDynamicPartitioner(runtimeArguments);
        if (dynamicPartitionerClassName == null) {
            throw new DataSetException("Either a Partition key or a DynamicPartitioner class must be given as a runtime argument.");
        }
        // propagate output metadata into OutputFormatConfiguration so DynamicPartitionerOutputCommitter can assign
        // the metadata when it creates the partitions
        Map<String, String> outputMetadata = PartitionedFileSetArguments.getOutputPartitionMetadata(runtimeArguments);
        PartitionedFileSetArguments.setOutputPartitionMetadata(outputArgs, outputMetadata);
        PartitionedFileSetArguments.setDynamicPartitioner(outputArgs, dynamicPartitionerClassName);
        PartitionedFileSetArguments.setDynamicPartitionerConcurrency(outputArgs, PartitionedFileSetArguments.isDynamicPartitionerConcurrencyAllowed(runtimeArguments));
        outputArgs.put(Constants.Dataset.Partitioned.HCONF_ATTR_OUTPUT_FORMAT_CLASS_NAME, files.getOutputFormatClassName());
        outputArgs.put(Constants.Dataset.Partitioned.HCONF_ATTR_OUTPUT_DATASET, getName());
    }
    return ImmutableMap.copyOf(outputArgs);
}
Also used: DataSetException (co.cask.cdap.api.dataset.DataSetException), HashMap (java.util.HashMap), PartitionKey (co.cask.cdap.api.dataset.lib.PartitionKey)
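
To pass the validation in this method, a caller must supply one of the two runtime arguments it checks for. A minimal sketch of each option; the partition field "year" is hypothetical, and CategoryPartitioner refers to the sketch after Example 21:

Map<String, String> runtimeArgs = new HashMap<>();

// Option 1: a concrete output partition key
PartitionedFileSetArguments.setOutputPartitionKey(
    runtimeArgs, PartitionKey.builder().addIntField("year", 2017).build());

// Option 2: a DynamicPartitioner class instead of a fixed key
// PartitionedFileSetArguments.setDynamicPartitioner(runtimeArgs, CategoryPartitioner.class);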

Aggregations

PartitionKey (co.cask.cdap.api.dataset.lib.PartitionKey) 59
PartitionedFileSet (co.cask.cdap.api.dataset.lib.PartitionedFileSet) 28
Test (org.junit.Test) 27
TransactionAware (org.apache.tephra.TransactionAware) 17
TransactionExecutor (org.apache.tephra.TransactionExecutor) 17
IOException (java.io.IOException) 12
HashMap (java.util.HashMap) 12
PartitionDetail (co.cask.cdap.api.dataset.lib.PartitionDetail) 11
ConcurrentPartitionConsumer (co.cask.cdap.api.dataset.lib.partitioned.ConcurrentPartitionConsumer) 11
PartitionConsumer (co.cask.cdap.api.dataset.lib.partitioned.PartitionConsumer) 11
ArrayList (java.util.ArrayList) 11
List (java.util.List) 11
HashSet (java.util.HashSet) 10
DataSetException (co.cask.cdap.api.dataset.DataSetException) 9
ImmutableList (com.google.common.collect.ImmutableList) 9
PartitionNotFoundException (co.cask.cdap.api.dataset.PartitionNotFoundException) 7
Partition (co.cask.cdap.api.dataset.lib.Partition) 7
ConsumerConfiguration (co.cask.cdap.api.dataset.lib.partitioned.ConsumerConfiguration) 7
TimePartitionedFileSet (co.cask.cdap.api.dataset.lib.TimePartitionedFileSet) 6
Location (org.apache.twill.filesystem.Location) 6