Example 61 with PartitionKey

Use of co.cask.cdap.api.dataset.lib.PartitionKey in project cdap by caskdata.

From class PartitionedFileSetDataset, method fixPartitions:

/**
 * This method brings a partitioned file set in sync with Explore. It scans the partition table and adds
 * every partition to Explore. It starts multiple transactions, processing a batch of partitions in each
 * transaction. Optionally, it can first disable and re-enable Explore, that is, drop and recreate the Hive table.
 * @param transactional the Transactional for executing transactions
 * @param datasetName the name of the dataset to fix
 * @param doDisable whether to disable and re-enable explore first
 * @param partitionsPerTx how many partitions to process per transaction
 * @param verbose whether to log verbosely. If true, this will log a message for every partition; otherwise it
 *                will only log a report of how many partitions were added / could not be added.
 */
@Beta
@SuppressWarnings("unused")
public static void fixPartitions(Transactional transactional, final String datasetName,
                                 boolean doDisable, final int partitionsPerTx, final boolean verbose) {
    if (doDisable) {
        try {
            transactional.execute(new TxRunnable() {

                @Override
                public void run(co.cask.cdap.api.data.DatasetContext context) throws Exception {
                    PartitionedFileSetDataset pfs = context.getDataset(datasetName);
                    pfs.disableExplore();
                    // truncating = true: dropping and recreating the Hive table amounts to truncating it
                    pfs.enableExplore(true);
                }
            });
        } catch (TransactionFailureException e) {
            throw new DataSetException("Unable to disable and enable Explore", e.getCause());
        } catch (RuntimeException e) {
            if (e.getCause() instanceof TransactionFailureException) {
                throw new DataSetException("Unable to disable and enable Explore", e.getCause().getCause());
            }
            throw e;
        }
    }
    final AtomicReference<PartitionKey> startKey = new AtomicReference<>();
    final AtomicLong errorCount = new AtomicLong(0L);
    final AtomicLong successCount = new AtomicLong(0L);
    do {
        try {
            transactional.execute(new TxRunnable() {

                @Override
                public void run(co.cask.cdap.api.data.DatasetContext context) throws Exception {
                    final PartitionedFileSetDataset pfs = context.getDataset(datasetName);
                    // compute start row for the scan, reset remembered start key to null
                    byte[] startRow = startKey.get() == null ? null : generateRowKey(startKey.get(), pfs.getPartitioning());
                    startKey.set(null);
                    PartitionConsumer consumer = new PartitionConsumer() {

                        int count = 0;

                        @Override
                        public void consume(PartitionKey key, String path, @Nullable PartitionMetadata metadata) {
                            if (count >= partitionsPerTx) {
                                // reached the limit: remember this key as the start for the next round
                                startKey.set(key);
                                return;
                            }
                            try {
                                pfs.addPartitionToExplore(key, path);
                                successCount.incrementAndGet();
                                if (verbose) {
                                    LOG.info("Added partition {} with path {}", key, path);
                                }
                            } catch (DataSetException e) {
                                errorCount.incrementAndGet();
                                if (verbose) {
                                    LOG.warn(e.getMessage(), e);
                                }
                            }
                            count++;
                        }
                    };
                    pfs.getPartitions(null, consumer, false, startRow, null, partitionsPerTx + 1);
                }
            });
        } catch (TransactionConflictException e) {
            throw new DataSetException("Transaction conflict while reading partitions. This should never happen. " + "Make sure that no other programs are using this dataset at the same time.");
        } catch (TransactionFailureException e) {
            throw new DataSetException("Transaction failure: " + e.getMessage(), e.getCause());
        } catch (RuntimeException e) {
            // this looks like duplication but is needed in case this is run from a worker: see CDAP-6837
            if (e.getCause() instanceof TransactionConflictException) {
                throw new DataSetException("Transaction conflict while reading partitions. This should never happen. " + "Make sure that no other programs are using this dataset at the same time.");
            } else if (e.getCause() instanceof TransactionFailureException) {
                throw new DataSetException("Transaction failure: " + e.getMessage(), e.getCause().getCause());
            } else {
                throw e;
            }
        }
        // if startKey is null, then we consumed less than the limit in this round -> done
    } while (startKey.get() != null);
    LOG.info("Added {} partitions, failed to add {} partitions.", successCount.get(), errorCount.get());
}
Also used : PartitionMetadata(co.cask.cdap.api.dataset.lib.PartitionMetadata) TransactionConflictException(org.apache.tephra.TransactionConflictException) AtomicReference(java.util.concurrent.atomic.AtomicReference) TransactionFailureException(org.apache.tephra.TransactionFailureException) PartitionNotFoundException(co.cask.cdap.api.dataset.PartitionNotFoundException) PartitionAlreadyExistsException(co.cask.cdap.api.dataset.lib.PartitionAlreadyExistsException) IOException(java.io.IOException) DataSetException(co.cask.cdap.api.dataset.DataSetException) AtomicLong(java.util.concurrent.atomic.AtomicLong) TxRunnable(co.cask.cdap.api.TxRunnable) PartitionKey(co.cask.cdap.api.dataset.lib.PartitionKey) Beta(co.cask.cdap.api.annotation.Beta)
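
For context, a minimal sketch of how fixPartitions might be invoked. It assumes a CDAP worker, since WorkerContext implements Transactional; the worker class, the dataset name "events", and the batch size are illustrative placeholders rather than values from the source, and the package of the internal PartitionedFileSetDataset class is an assumption.

import co.cask.cdap.api.worker.AbstractWorker;
import co.cask.cdap.data2.dataset2.lib.partitioned.PartitionedFileSetDataset;

// Hypothetical worker that repairs the Explore state of a partitioned file set.
public class FixPartitionsWorker extends AbstractWorker {

    @Override
    public void run() {
        // getContext() returns a WorkerContext, which implements Transactional
        // and can drive the batched transactions that fixPartitions starts.
        PartitionedFileSetDataset.fixPartitions(
            getContext(),  // the Transactional
            "events",      // name of the dataset to fix (placeholder)
            true,          // drop and recreate the Hive table first
            100,           // partitions per transaction
            false);        // log a summary only, not every partition
    }
}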

Example 62 with PartitionKey

Use of co.cask.cdap.api.dataset.lib.PartitionKey in project cdap by caskdata.

From class PartitionedFileSetDataset, method getInputFormatConfiguration:

@Override
public Map<String, String> getInputFormatConfiguration() {
    Collection<PartitionKey> inputKeys = getInputKeys();
    List<Location> inputLocations = new ArrayList<>(inputKeys.size());
    Map<String, PartitionKey> pathToKey = new HashMap<>(inputKeys.size());
    for (PartitionKey key : inputKeys) {
        PartitionDetail partition = getPartition(key);
        String path = Objects.requireNonNull(partition).getRelativePath();
        Location partitionLocation = files.getLocation(path);
        inputLocations.add(partitionLocation);
        pathToKey.put(partitionLocation.toURI().toString(), key);
    }
    Map<String, String> inputFormatConfiguration = files.getInputFormatConfiguration(inputLocations);
    inputFormatConfiguration.put(PATH_TO_PARTITIONING_MAPPING, GSON.toJson(pathToKey));
    return inputFormatConfiguration;
}
Also used : HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) PartitionKey(co.cask.cdap.api.dataset.lib.PartitionKey) PartitionDetail(co.cask.cdap.api.dataset.lib.PartitionDetail) Location(org.apache.twill.filesystem.Location)
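
As a hedged sketch of the caller side: the keys returned by getInputKeys() above are typically selected through runtime arguments, for example with a partition filter. The field names "year" and "month" are assumptions about the partitioning, not taken from the source.

import co.cask.cdap.api.dataset.lib.PartitionFilter;
import co.cask.cdap.api.dataset.lib.PartitionedFileSetArguments;
import java.util.HashMap;
import java.util.Map;

// Illustrative helper: build runtime arguments that restrict the input
// to June 2017; only matching partitions become input locations.
static Map<String, String> inputArgsForJune2017() {
    Map<String, String> args = new HashMap<>();
    PartitionFilter filter = PartitionFilter.builder()
        .addValueCondition("year", 2017)   // exact match
        .addRangeCondition("month", 6, 7)  // lower inclusive, upper exclusive
        .build();
    PartitionedFileSetArguments.setInputPartitionFilter(args, filter);
    return args;
}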

Example 63 with PartitionKey

Use of co.cask.cdap.api.dataset.lib.PartitionKey in project cdap by caskdata.

From class PartitionedFileSetDefinition, method updateArgumentsIfNeeded:

// if the arguments do not contain an output location, generate one from the partition key (if present)
protected static Map<String, String> updateArgumentsIfNeeded(Map<String, String> arguments, Partitioning partitioning) {
    if (FileSetArguments.getOutputPath(arguments) == null) {
        PartitionKey key = PartitionedFileSetArguments.getOutputPartitionKey(arguments, partitioning);
        // we need to copy the map, to avoid modifying the passed-in map
        arguments = Maps.newHashMap(arguments);
        if (key != null) {
            FileSetArguments.setOutputPath(arguments, PartitionedFileSetDataset.getOutputPath(key, partitioning));
        } else if (PartitionedFileSetArguments.getDynamicPartitioner(arguments) != null) {
            // when using DynamicPartitioner, use the baseLocation of the fileSet as the output location
            FileSetArguments.setBaseOutputPath(arguments);
        }
    }
    return arguments;
}
Also used : PartitionKey(co.cask.cdap.api.dataset.lib.PartitionKey)
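
A short sketch of how a caller can exercise the first branch: setting an output partition key in the runtime arguments so that the output path is derived from it. The field names below are illustrative assumptions.

import co.cask.cdap.api.dataset.lib.PartitionKey;
import co.cask.cdap.api.dataset.lib.PartitionedFileSetArguments;
import java.util.HashMap;
import java.util.Map;

// Illustrative helper: runtime arguments carrying only an output partition key.
static Map<String, String> outputArgsForPartition() {
    Map<String, String> runtimeArgs = new HashMap<>();
    PartitionKey key = PartitionKey.builder()
        .addStringField("league", "nfl")  // placeholder field names
        .addIntField("season", 2017)
        .build();
    PartitionedFileSetArguments.setOutputPartitionKey(runtimeArgs, key);
    // With no explicit output path set, updateArgumentsIfNeeded(...) derives
    // one from this key via PartitionedFileSetDataset.getOutputPath.
    return runtimeArgs;
}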

Example 64 with PartitionKey

Use of co.cask.cdap.api.dataset.lib.PartitionKey in project cdap by caskdata.

From class TimePartitionedFileSetDataset, method getPartitionOutput:

@Override
public TimePartitionOutput getPartitionOutput(long time) {
    if (isExternal) {
        throw new UnsupportedOperationException("Output is not supported for external time-partitioned file set '" + spec.getName() + "'");
    }
    PartitionKey key = partitionKeyForTime(time);
    assertNotExists(key, true);
    return new BasicTimePartitionOutput(this, getOutputPath(key), key);
}
Also used : PartitionKey(co.cask.cdap.api.dataset.lib.PartitionKey)
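
For illustration, a hedged sketch of the write path that typically follows getPartitionOutput. It assumes program code with a DatasetContext in scope, a dataset named "logs" (both assumptions), and that PartitionOutput exposes getLocation() as in recent CDAP versions.

import co.cask.cdap.api.data.DatasetContext;
import co.cask.cdap.api.dataset.lib.TimePartitionOutput;
import co.cask.cdap.api.dataset.lib.TimePartitionedFileSet;
import org.apache.twill.filesystem.Location;
import java.io.OutputStream;
import java.nio.charset.StandardCharsets;

// Illustrative helper: write one file into the partition for "now" and register it.
static void writeHourlyPartition(DatasetContext context) throws Exception {
    TimePartitionedFileSet tpfs = context.getDataset("logs"); // "logs" is a placeholder
    TimePartitionOutput output = tpfs.getPartitionOutput(System.currentTimeMillis());
    Location file = output.getLocation().append("part-0");
    try (OutputStream os = file.getOutputStream()) {
        os.write("sample record\n".getBytes(StandardCharsets.UTF_8));
    }
    output.addPartition(); // registers the partition, making it visible to readers
}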

Example 65 with PartitionKey

Use of co.cask.cdap.api.dataset.lib.PartitionKey in project cdap by caskdata.

From class TimePartitionedFileSetDefinition, method updateArgumentsIfNeeded:

// if the arguments do not contain an output path but do contain an output partition time, generate an output path from that time;
// also convert the output partition time to a partition key and add it to the arguments;
// finally, delegate to the superclass method so it can update the arguments as well
protected Map<String, String> updateArgumentsIfNeeded(Map<String, String> arguments) {
    Long time = TimePartitionedFileSetArguments.getOutputPartitionTime(arguments);
    if (time != null) {
        // set the output path according to partition time
        if (FileSetArguments.getOutputPath(arguments) == null) {
            String outputPathFormat = TimePartitionedFileSetArguments.getOutputPathFormat(arguments);
            String path;
            if (Strings.isNullOrEmpty(outputPathFormat)) {
                path = String.format("%tF/%tH-%tM.%d", time, time, time, time);
            } else {
                SimpleDateFormat format = new SimpleDateFormat(outputPathFormat);
                String timeZoneID = TimePartitionedFileSetArguments.getOutputPathTimeZone(arguments);
                if (!Strings.isNullOrEmpty(timeZoneID)) {
                    format.setTimeZone(TimeZone.getTimeZone(timeZoneID));
                }
                path = format.format(new Date(time));
            }
            arguments = Maps.newHashMap(arguments);
            FileSetArguments.setOutputPath(arguments, path);
        }
        // add the corresponding partition key to the arguments
        PartitionKey outputKey = TimePartitionedFileSetDataset.partitionKeyForTime(time);
        PartitionedFileSetArguments.setOutputPartitionKey(arguments, outputKey);
    }
    // delegate to super class for anything it needs to do
    return updateArgumentsIfNeeded(arguments, TimePartitionedFileSetDataset.PARTITIONING);
}
Also used : PartitionKey(co.cask.cdap.api.dataset.lib.PartitionKey) SimpleDateFormat(java.text.SimpleDateFormat) Date(java.util.Date)
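
To make the default path format concrete, a small worked example (the timestamp is arbitrary). Note that %tF, %tH, and %tM render in the JVM's default time zone unless an output path format with an explicit time zone is supplied.

long time = 1500000000000L; // 2017-07-14T02:40:00Z
// Default layout: ISO date / hour-minute . raw epoch millis
String path = String.format("%tF/%tH-%tM.%d", time, time, time, time);
// In a JVM whose default time zone is UTC this yields
// "2017-07-14/02-40.1500000000000"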

Aggregations

PartitionKey (co.cask.cdap.api.dataset.lib.PartitionKey) 70
PartitionedFileSet (co.cask.cdap.api.dataset.lib.PartitionedFileSet) 32
Test (org.junit.Test) 28
TransactionAware (org.apache.tephra.TransactionAware) 19
TransactionExecutor (org.apache.tephra.TransactionExecutor) 19
IOException (java.io.IOException) 16
HashMap (java.util.HashMap) 15
PartitionDetail (co.cask.cdap.api.dataset.lib.PartitionDetail) 12
ConcurrentPartitionConsumer (co.cask.cdap.api.dataset.lib.partitioned.ConcurrentPartitionConsumer) 11
PartitionConsumer (co.cask.cdap.api.dataset.lib.partitioned.PartitionConsumer) 11
ArrayList (java.util.ArrayList) 11
List (java.util.List) 11
HashSet (java.util.HashSet) 10
DataSetException (co.cask.cdap.api.dataset.DataSetException) 9
ImmutableList (com.google.common.collect.ImmutableList) 9
Partition (co.cask.cdap.api.dataset.lib.Partition) 8
Map (java.util.Map) 8
Location (org.apache.twill.filesystem.Location) 8
PartitionNotFoundException (co.cask.cdap.api.dataset.PartitionNotFoundException) 7
ConsumerConfiguration (co.cask.cdap.api.dataset.lib.partitioned.ConsumerConfiguration) 7