Search in sources :

Example 11 with PartitionKey

use of io.cdap.cdap.api.dataset.lib.PartitionKey in project cdap by caskdata.

the class PartitionedFileSetDataset method getPartitions.

/**
 * Scans the partitions table between {@code startKey} and {@code endKey} and hands the partitions
 * that pass the filter to the given consumer.
 *
 * <p>Rows whose key cannot be parsed, or whose key is rejected by the filter, are skipped and do not
 * count toward the limit. A row that passes the filter counts toward the limit even when it has no
 * relative path column (in which case the consumer is not invoked for it).
 *
 * @param filter optional filter on the partition key; {@code null} accepts every key
 * @param consumer receives the key, the relative path and (optionally) the metadata of each partition
 * @param decodeMetadata whether to decode the row's metadata; if false, the consumer receives {@code null}
 * @param startKey optional start row key passed to the table scan
 * @param endKey optional end row key passed to the table scan
 * @param limit upper bound on the number of counted rows; the scan stops once it is reached
 */
private void getPartitions(@Nullable PartitionFilter filter, PartitionConsumer consumer, boolean decodeMetadata, @Nullable byte[] startKey, @Nullable byte[] endKey, long limit) {
    long accepted = 0L;
    try (Scanner rows = partitionsTable.scan(startKey, endKey)) {
        Row current;
        // the limit is tested before the next row is fetched, so the scanner is never advanced needlessly
        while (accepted < limit && (current = rows.next()) != null) {
            PartitionKey partitionKey;
            try {
                partitionKey = parseRowKey(current.getRow(), partitioning);
            } catch (IllegalArgumentException e) {
                // a malformed row key is only logged; move on to the next row
                LOG.debug(String.format("Failed to parse row key for partitioned file set '%s': %s", getName(), Bytes.toStringBinary(current.getRow())));
                continue;
            }
            boolean accept = filter == null || filter.match(partitionKey);
            if (accept) {
                byte[] relativePath = current.get(RELATIVE_PATH);
                if (relativePath != null) {
                    consumer.consume(partitionKey, Bytes.toString(relativePath), decodeMetadata ? metadataFromRow(current) : null);
                }
                // counted regardless of whether a relative path was present
                accepted++;
            }
        }
        if (accepted == 0) {
            // nothing was counted: let the helper check whether the filter itself is the reason
            warnIfInvalidPartitionFilter(filter, partitioning);
        }
    }
}
Also used : Scanner(io.cdap.cdap.api.dataset.table.Scanner) PartitionKey(io.cdap.cdap.api.dataset.lib.PartitionKey) Row(io.cdap.cdap.api.dataset.table.Row)

Example 12 with PartitionKey

use of io.cdap.cdap.api.dataset.lib.PartitionKey in project cdap by caskdata.

the class PartitionedFileSetDefinition method updateArgumentsIfNeeded.

/**
 * Ensures that the arguments carry an output location when one can be derived.
 *
 * <p>If the arguments already contain an output path, they are returned as is. Otherwise a copy of the
 * map is returned (the passed-in map is never modified): with an output partition key present, the
 * output path is generated from that key; with a DynamicPartitioner configured instead, the base
 * output path is set; with neither, the copy is returned unchanged.
 *
 * @param arguments the runtime arguments to inspect
 * @param partitioning the partitioning used to read the output partition key and build its path
 * @return the original arguments, or an updated copy of them
 */
protected static Map<String, String> updateArgumentsIfNeeded(Map<String, String> arguments, Partitioning partitioning) {
    if (FileSetArguments.getOutputPath(arguments) != null) {
        // an explicit output path wins; nothing to derive
        return arguments;
    }
    PartitionKey key = PartitionedFileSetArguments.getOutputPartitionKey(arguments, partitioning);
    // work on a copy, to avoid modifying the passed-in map
    Map<String, String> updated = Maps.newHashMap(arguments);
    if (key != null) {
        String outputPath = PartitionedFileSetDataset.getOutputPath(key, partitioning);
        FileSetArguments.setOutputPath(updated, outputPath);
        return updated;
    }
    if (PartitionedFileSetArguments.getDynamicPartitioner(updated) != null) {
        // when using DynamicPartitioner, use the baseLocation of the fileSet as the output location
        FileSetArguments.setBaseOutputPath(updated);
    }
    return updated;
}
Also used : PartitionKey(io.cdap.cdap.api.dataset.lib.PartitionKey)

Example 13 with PartitionKey

use of io.cdap.cdap.api.dataset.lib.PartitionKey in project cdap by caskdata.

the class PartitionedFileSetDataset method onSuccess.

/**
 * Registers the output partition, if an output path and an output partition key were given in the
 * runtime arguments, and then forwards the success notification to the embedded file set.
 *
 * <p>When the runtime arguments contain no output path, this method returns immediately and the
 * embedded file set is not notified.
 *
 * @throws DataSetException declared by this method; presumably raised by {@code addPartition} - confirm
 */
@Override
public void onSuccess() throws DataSetException {
    String outputPath = FileSetArguments.getOutputPath(runtimeArguments);
    if (outputPath == null) {
        // without an output path there is nothing that can be registered here
        return;
    }
    // it is possible that there is no output key, if using the DynamicPartitioner; in that case
    // DynamicPartitioningOutputFormat is responsible for registering the partitions and the metadata
    PartitionKey outputKey = PartitionedFileSetArguments.getOutputPartitionKey(runtimeArguments, getPartitioning());
    if (outputKey != null) {
        addPartition(outputKey, outputPath, PartitionedFileSetArguments.getOutputPartitionMetadata(runtimeArguments), true, false);
    }
    // FileSetDataset#onSuccess is currently a no-op; it is still called in case that changes in the future
    FileSetDataset embedded = (FileSetDataset) files;
    embedded.onSuccess();
}
Also used : FileSetDataset(io.cdap.cdap.data2.dataset2.lib.file.FileSetDataset) PartitionKey(io.cdap.cdap.api.dataset.lib.PartitionKey)

Example 14 with PartitionKey

use of io.cdap.cdap.api.dataset.lib.PartitionKey in project cdap by caskdata.

the class PartitionedFileSetDataset method getOutputFormatConfiguration.

/**
 * Builds the output format configuration from the embedded file set's configuration.
 *
 * <p>The file set's output path is set in the definition's getDataset(), so it is not configured
 * again here; this method only validates that either an output partition key or a DynamicPartitioner
 * was given in the runtime arguments. For a DynamicPartitioner, the partitioner arguments, the file
 * set's output format class name and this dataset's name are added to the configuration.
 *
 * @return an immutable copy of the output format configuration
 * @throws DataSetException if neither a partition key nor a DynamicPartitioner class is given
 */
@Override
public Map<String, String> getOutputFormatConfiguration() {
    checkNotExternal();
    // start from a copy of the output properties of the embedded file set
    Map<String, String> outputArgs = new HashMap<>(files.getOutputFormatConfiguration());
    PartitionKey outputKey = PartitionedFileSetArguments.getOutputPartitionKey(runtimeArguments, getPartitioning());
    if (outputKey != null) {
        // fixed output partition: presumably verifies it does not exist yet - confirm in assertNotExists
        assertNotExists(outputKey, true);
        return ImmutableMap.copyOf(outputArgs);
    }
    // no fixed key: a DynamicPartitioner is then mandatory
    if (PartitionedFileSetArguments.getDynamicPartitioner(runtimeArguments) == null) {
        throw new DataSetException("Either a Partition key or a DynamicPartitioner class must be given as a runtime argument.");
    }
    copyDynamicPartitionerArguments(runtimeArguments, outputArgs);
    outputArgs.put(Constants.Dataset.Partitioned.HCONF_ATTR_OUTPUT_FORMAT_CLASS_NAME, files.getOutputFormatClassName());
    outputArgs.put(Constants.Dataset.Partitioned.HCONF_ATTR_OUTPUT_DATASET, getName());
    return ImmutableMap.copyOf(outputArgs);
}
Also used : DataSetException(io.cdap.cdap.api.dataset.DataSetException) HashMap(java.util.HashMap) PartitionKey(io.cdap.cdap.api.dataset.lib.PartitionKey)

Example 15 with PartitionKey

use of io.cdap.cdap.api.dataset.lib.PartitionKey in project cdap by caskdata.

the class TimePartitionedFileSetDataset method getPartitionOutput.

/**
 * Creates a partition output for the partition key derived from the given time.
 *
 * @param time the time from which the partition key is derived via {@code partitionKeyForTime}
 * @return a new {@code BasicTimePartitionOutput} for the derived key and its output path
 * @throws UnsupportedOperationException if this time-partitioned file set is external
 */
@Override
public TimePartitionOutput getPartitionOutput(long time) {
    if (isExternal) {
        String message = "Output is not supported for external time-partitioned file set '" + spec.getName() + "'";
        throw new UnsupportedOperationException(message);
    }
    PartitionKey timeKey = partitionKeyForTime(time);
    // presumably rejects a partition that already exists - confirm in assertNotExists
    assertNotExists(timeKey, true);
    String outputPath = getOutputPath(timeKey);
    return new BasicTimePartitionOutput(this, outputPath, timeKey);
}
Also used : PartitionKey(io.cdap.cdap.api.dataset.lib.PartitionKey)

Aggregations

PartitionKey (io.cdap.cdap.api.dataset.lib.PartitionKey) 121, Test (org.junit.Test) 55, PartitionedFileSet (io.cdap.cdap.api.dataset.lib.PartitionedFileSet) 53, TransactionAware (org.apache.tephra.TransactionAware) 34, TransactionExecutor (org.apache.tephra.TransactionExecutor) 34, IOException (java.io.IOException) 26, PartitionDetail (io.cdap.cdap.api.dataset.lib.PartitionDetail) 23, ConcurrentPartitionConsumer (io.cdap.cdap.api.dataset.lib.partitioned.ConcurrentPartitionConsumer) 22, PartitionConsumer (io.cdap.cdap.api.dataset.lib.partitioned.PartitionConsumer) 22, ArrayList (java.util.ArrayList) 22, List (java.util.List) 22, HashMap (java.util.HashMap) 21, ImmutableList (com.google.common.collect.ImmutableList) 18, DataSetException (io.cdap.cdap.api.dataset.DataSetException) 18, HashSet (java.util.HashSet) 18, Partition (io.cdap.cdap.api.dataset.lib.Partition) 14, ConsumerConfiguration (io.cdap.cdap.api.dataset.lib.partitioned.ConsumerConfiguration) 14, DatasetId (io.cdap.cdap.proto.id.DatasetId) 14, Map (java.util.Map) 14, Location (org.apache.twill.filesystem.Location) 14