Use of co.cask.cdap.api.dataset.lib.PartitionKey in project cdap by caskdata.
The class PartitionedFileSetDataset, method scannerToPartitions.
/**
 * While applying a partition filter and a limit, parse partitions from the rows of a scanner and add them to a list.
 * Note that multiple partitions can have the same transaction write pointer. For each set of partitions with the same
 * write pointer, we either add the entire set or exclude the entire set. The limit is applied after adding each such
 * set of partitions to the list.
 *
 * @param scanner the scanner on the partitions table from which to read partitions
 * @param partitions list to add the qualifying partitions to
 * @param limit limit, which once reached, partitions committed by other transactions will not be added.
 * The limit is checked only after consuming all partitions of a transaction, so
 * the total number of consumed partitions may be greater than this limit.
 * @param predicate predicate to apply before adding to the partitions list
 * @return Transaction ID of the partition that we reached in the scanner, but did not add to the list. This value
 * can be useful in future scans (as a resume marker). Returns {@code null} when the scanner was exhausted.
 */
@Nullable
private Long scannerToPartitions(Scanner scanner, List<PartitionDetail> partitions, int limit, Predicate<PartitionDetail> predicate) {
// txId of the previous row; null until the first row is read
Long prevTxId = null;
Row row;
while ((row = scanner.next()) != null) {
PartitionKey key = parseRowKey(row.getRow(), partitioning);
String relativePath = Bytes.toString(row.get(RELATIVE_PATH));
Long txId = Bytes.toLong(row.get(WRITE_PTR_COL));
// the limit is only checked at a transaction boundary, so that we return either all of the
// partitions written by a transaction or none, since we keep our marker based upon transaction id.
if (prevTxId != null && !prevTxId.equals(txId)) {
if (partitions.size() >= limit) {
// this row (the first of the next transaction) is NOT added; its txId is the resume marker
return txId;
}
}
prevTxId = txId;
BasicPartitionDetail partitionDetail = new BasicPartitionDetail(PartitionedFileSetDataset.this, relativePath, key, metadataFromRow(row));
// filtered-out partitions are skipped but still advance prevTxId, keeping the
// transaction-boundary bookkeeping above correct
if (!predicate.apply(partitionDetail)) {
continue;
}
partitions.add(partitionDetail);
}
// scanner exhausted: no resume marker to report
return null;
}
Use of co.cask.cdap.api.dataset.lib.PartitionKey in project cdap by caskdata.
The class TimePartitionedFileSetDefinition, method updateArgumentsIfNeeded.
// if the arguments do not contain an output path, but an output partition time, generate an output path from that;
// also convert the output partition time to a partition key and add it to the arguments;
// also call the super class' method to update arguments if it needs to
// If the arguments carry an output partition time but no explicit output path, derive a path from
// that time (honoring a custom output path format and time zone when provided), and also record the
// corresponding output partition key in the arguments. Always delegates to the superclass logic last.
protected Map<String, String> updateArgumentsIfNeeded(Map<String, String> arguments) {
  Long partitionTime = TimePartitionedFileSetArguments.getOutputPartitionTime(arguments);
  if (partitionTime != null) {
    if (FileSetArguments.getOutputPath(arguments) == null) {
      String pathFormat = TimePartitionedFileSetArguments.getOutputPathFormat(arguments);
      String derivedPath;
      if (!Strings.isNullOrEmpty(pathFormat)) {
        // a custom format was supplied; apply it, optionally in the requested time zone
        SimpleDateFormat formatter = new SimpleDateFormat(pathFormat);
        String zoneId = TimePartitionedFileSetArguments.getOutputPathTimeZone(arguments);
        if (!Strings.isNullOrEmpty(zoneId)) {
          formatter.setTimeZone(TimeZone.getTimeZone(zoneId));
        }
        derivedPath = formatter.format(new Date(partitionTime));
      } else {
        // default layout: <yyyy-MM-dd>/<HH>-<mm>.<epoch millis>
        derivedPath = String.format("%tF/%tH-%tM.%d", partitionTime, partitionTime, partitionTime, partitionTime);
      }
      // copy before mutating so the caller-supplied map stays untouched
      arguments = Maps.newHashMap(arguments);
      FileSetArguments.setOutputPath(arguments, derivedPath);
    }
    // also expose the partition key corresponding to the partition time
    PartitionedFileSetArguments.setOutputPartitionKey(
        arguments, TimePartitionedFileSetDataset.partitionKeyForTime(partitionTime));
  }
  // let the superclass apply any further argument updates it needs
  return updateArgumentsIfNeeded(arguments, TimePartitionedFileSetDataset.PARTITIONING);
}
Use of co.cask.cdap.api.dataset.lib.PartitionKey in project cdap by caskdata.
The class PartitionedFileSetDefinition, method updateArgumentsIfNeeded.
// if the arguments do not contain an output location, generate one from the partition key (if present)
/**
 * If the arguments do not contain an output location, generate one from the output partition key
 * (if present); otherwise, when a DynamicPartitioner is configured, fall back to the file set's
 * base location.
 *
 * @param arguments the runtime arguments; never mutated — a copy is made before any change
 * @param partitioning the partitioning used to parse and render the output partition key
 * @return the original map when no change was needed, otherwise a modified copy
 */
protected static Map<String, String> updateArgumentsIfNeeded(Map<String, String> arguments, Partitioning partitioning) {
  if (FileSetArguments.getOutputPath(arguments) == null) {
    PartitionKey key = PartitionedFileSetArguments.getOutputPartitionKey(arguments, partitioning);
    if (key != null) {
      arguments = Maps.newHashMap(arguments);
      FileSetArguments.setOutputPath(arguments, PartitionedFileSetDataset.getOutputPath(key, partitioning));
    } else if (PartitionedFileSetArguments.getDynamicPartitioner(arguments) != null) {
      // when using DynamicPartitioner, use the baseLocation of the fileSet as the output location.
      // Fix: copy before mutating, consistent with the branch above — previously this wrote
      // directly into the caller-supplied map, which fails on immutable maps and leaks the
      // change back to the caller.
      arguments = Maps.newHashMap(arguments);
      FileSetArguments.setBaseOutputPath(arguments);
    }
  }
  return arguments;
}
Use of co.cask.cdap.api.dataset.lib.PartitionKey in project cdap by caskdata.
The class ExploreExecutorHttpHandler, method doAddPartition.
/**
 * Handles a request to register a new partition of a {@link PartitionedFileSet} with Explore.
 * Reads a JSON body of string properties (must contain "path" and the partition key fields),
 * then submits an add-partition query via the explore table manager. All failures are reported
 * through the responder; this method never throws.
 *
 * @param request the HTTP request whose content is a JSON map of properties
 * @param responder used to send the HTTP response (exactly one response is sent on every path)
 * @param datasetId the dataset to add the partition to
 */
private void doAddPartition(HttpRequest request, HttpResponder responder, DatasetId datasetId) {
Dataset dataset;
try (SystemDatasetInstantiator datasetInstantiator = datasetInstantiatorFactory.create()) {
dataset = datasetInstantiator.getDataset(datasetId);
if (dataset == null) {
responder.sendString(HttpResponseStatus.NOT_FOUND, "Cannot load dataset " + datasetId);
return;
}
} catch (IOException e) {
String classNotFoundMessage = isClassNotFoundException(e);
if (classNotFoundMessage != null) {
// the dataset's classes are not available; respond OK with a no-op handle so the
// caller does not treat a non-explorable dataset as a hard failure
JsonObject json = new JsonObject();
json.addProperty("handle", QueryHandle.NO_OP.getHandle());
responder.sendJson(HttpResponseStatus.OK, json);
return;
}
LOG.error("Exception instantiating dataset {}.", datasetId, e);
responder.sendString(HttpResponseStatus.INTERNAL_SERVER_ERROR, "Exception instantiating dataset " + datasetId.getDataset());
return;
}
try {
if (!(dataset instanceof PartitionedFileSet)) {
responder.sendString(HttpResponseStatus.BAD_REQUEST, "not a partitioned dataset.");
return;
}
Partitioning partitioning = ((PartitionedFileSet) dataset).getPartitioning();
// parse the request body as a JSON map of string properties
Reader reader = new InputStreamReader(new ChannelBufferInputStream(request.getContent()));
Map<String, String> properties = GSON.fromJson(reader, new TypeToken<Map<String, String>>() {
}.getType());
String fsPath = properties.get("path");
if (fsPath == null) {
responder.sendString(HttpResponseStatus.BAD_REQUEST, "path was not specified.");
return;
}
// the partition key is encoded in the same properties map, alongside "path"
PartitionKey partitionKey;
try {
partitionKey = PartitionedFileSetArguments.getOutputPartitionKey(properties, partitioning);
} catch (Exception e) {
responder.sendString(HttpResponseStatus.BAD_REQUEST, "invalid partition key: " + e.getMessage());
return;
}
if (partitionKey == null) {
responder.sendString(HttpResponseStatus.BAD_REQUEST, "no partition key was given.");
return;
}
// submit the add-partition operation and hand the query handle back to the caller
QueryHandle handle = exploreTableManager.addPartition(datasetId, properties, partitionKey, fsPath);
JsonObject json = new JsonObject();
json.addProperty("handle", handle.getHandle());
responder.sendJson(HttpResponseStatus.OK, json);
} catch (Throwable e) {
// catch-all boundary: any unexpected failure becomes a 500 rather than an unhandled error
LOG.error("Got exception:", e);
responder.sendString(HttpResponseStatus.INTERNAL_SERVER_ERROR, e.getMessage());
}
}
Aggregations