Use of co.cask.cdap.api.dataset.lib.PartitionKey in project cdap by caskdata.
Class PartitionedFileSetDataset, method fixPartitions:
/**
 * This method can bring a partitioned file set in sync with explore. It scans the partition table and adds
 * every partition to explore. It will start multiple transactions, processing a batch of partitions in each
 * transaction. Optionally, it can disable and re-enable explore first, that is, drop and recreate the Hive table.
 *
 * @param transactional the Transactional for executing transactions
 * @param datasetName the name of the dataset to fix
 * @param doDisable whether to disable and re-enable explore first
 * @param partitionsPerTx how many partitions to process per transaction
 * @param verbose whether to log verbosely. If true, this will log a message for every partition; otherwise it
 *                will only log a report of how many partitions were added / could not be added.
 */
@Beta
@SuppressWarnings("unused")
public static void fixPartitions(Transactional transactional, final String datasetName,
                                 boolean doDisable, final int partitionsPerTx, final boolean verbose) {
  if (doDisable) {
    try {
      transactional.execute(new TxRunnable() {
        @Override
        public void run(co.cask.cdap.api.data.DatasetContext context) throws Exception {
          PartitionedFileSetDataset pfs = context.getDataset(datasetName);
          pfs.disableExplore();
          // truncating = true, because this is like truncating
          pfs.enableExplore(true);
        }
      });
    } catch (TransactionFailureException e) {
      throw new DataSetException("Unable to disable and enable Explore", e.getCause());
    } catch (RuntimeException e) {
      if (e.getCause() instanceof TransactionFailureException) {
        throw new DataSetException("Unable to disable and enable Explore", e.getCause().getCause());
      }
      throw e;
    }
  }
  final AtomicReference<PartitionKey> startKey = new AtomicReference<>();
  final AtomicLong errorCount = new AtomicLong(0L);
  final AtomicLong successCount = new AtomicLong(0L);
  do {
    try {
      transactional.execute(new TxRunnable() {
        @Override
        public void run(co.cask.cdap.api.data.DatasetContext context) throws Exception {
          final PartitionedFileSetDataset pfs = context.getDataset(datasetName);
          // compute start row for the scan, reset remembered start key to null
          byte[] startRow = startKey.get() == null ? null : generateRowKey(startKey.get(), pfs.getPartitioning());
          startKey.set(null);
          PartitionConsumer consumer = new PartitionConsumer() {
            int count = 0;

            @Override
            public void consume(PartitionKey key, String path, @Nullable PartitionMetadata metadata) {
              if (count >= partitionsPerTx) {
                // reached the limit: remember this key as the start for the next round
                startKey.set(key);
                return;
              }
              try {
                pfs.addPartitionToExplore(key, path);
                successCount.incrementAndGet();
                if (verbose) {
                  LOG.info("Added partition {} with path {}", key, path);
                }
              } catch (DataSetException e) {
                errorCount.incrementAndGet();
                if (verbose) {
                  LOG.warn(e.getMessage(), e);
                }
              }
              count++;
            }
          };
          pfs.getPartitions(null, consumer, false, startRow, null, partitionsPerTx + 1);
        }
      });
    } catch (TransactionConflictException e) {
      throw new DataSetException("Transaction conflict while reading partitions. This should never happen. "
                                   + "Make sure that no other programs are using this dataset at the same time.");
    } catch (TransactionFailureException e) {
      throw new DataSetException("Transaction failure: " + e.getMessage(), e.getCause());
    } catch (RuntimeException e) {
      // this looks like duplication but is needed in case this is run from a worker: see CDAP-6837
      if (e.getCause() instanceof TransactionConflictException) {
        throw new DataSetException("Transaction conflict while reading partitions. This should never happen. "
                                     + "Make sure that no other programs are using this dataset at the same time.");
      } else if (e.getCause() instanceof TransactionFailureException) {
        throw new DataSetException("Transaction failure: " + e.getMessage(), e.getCause().getCause());
      } else {
        throw e;
      }
    }
  } while (startKey.get() != null); // if it is null, then we consumed less than the limit in this round -> done
  LOG.info("Added {} partitions, failed to add {} partitions.", successCount.get(), errorCount.get());
}
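A minimal usage sketch (not from the CDAP sources): because fixPartitions is static and only needs a Transactional, it can be invoked from program code such as a worker, which is the scenario the CDAP-6837 comment above refers to. The worker class, dataset name, and batch size below are illustrative, and it is assumed that the worker's context can be passed where a Transactional is expected.

import co.cask.cdap.api.worker.AbstractWorker;

// Hypothetical maintenance worker; class name and dataset name are illustrative.
public class FixPartitionsWorker extends AbstractWorker {
  @Override
  public void run() {
    // Assumption: getContext() satisfies Transactional (it exposes execute(TxRunnable)).
    PartitionedFileSetDataset.fixPartitions(getContext(), "myPartitionedFileSet",
                                            true,    // drop and recreate the Hive table first
                                            100,     // partitions per transaction
                                            false);  // only log the final added/failed summary
  }
}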
Use of co.cask.cdap.api.dataset.lib.PartitionKey in project cdap by caskdata.
Class PartitionedFileSetDataset, method getInputFormatConfiguration:
@Override
public Map<String, String> getInputFormatConfiguration() {
  Collection<PartitionKey> inputKeys = getInputKeys();
  List<Location> inputLocations = new ArrayList<>(inputKeys.size());
  Map<String, PartitionKey> pathToKey = new HashMap<>(inputKeys.size());
  for (PartitionKey key : inputKeys) {
    PartitionDetail partition = getPartition(key);
    String path = Objects.requireNonNull(partition).getRelativePath();
    Location partitionLocation = files.getLocation(path);
    inputLocations.add(partitionLocation);
    pathToKey.put(partitionLocation.toURI().toString(), key);
  }
  Map<String, String> inputFormatConfiguration = files.getInputFormatConfiguration(inputLocations);
  inputFormatConfiguration.put(PATH_TO_PARTITIONING_MAPPING, GSON.toJson(pathToKey));
  return inputFormatConfiguration;
}
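For context, a hedged sketch of how the input side is commonly driven from runtime arguments (the dataset and field names are illustrative; exactly how and when the selected partitions become the keys returned by getInputKeys() depends on the CDAP version):

// Select input partitions for a run via a partition filter in the dataset arguments.
Map<String, String> inputArgs = new HashMap<>();
PartitionedFileSetArguments.setInputPartitionFilter(
  inputArgs, PartitionFilter.builder().addValueCondition("year", 2017).build());
// At job setup time, getInputFormatConfiguration() (above) resolves the selected partitions to
// file locations and serializes a location-to-PartitionKey mapping, so that each input split
// can later be associated with its PartitionKey.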
Use of co.cask.cdap.api.dataset.lib.PartitionKey in project cdap by caskdata.
Class PartitionedFileSetDefinition, method updateArgumentsIfNeeded:
// if the arguments do not contain an output location, generate one from the partition key (if present)
protected static Map<String, String> updateArgumentsIfNeeded(Map<String, String> arguments,
                                                             Partitioning partitioning) {
  if (FileSetArguments.getOutputPath(arguments) == null) {
    PartitionKey key = PartitionedFileSetArguments.getOutputPartitionKey(arguments, partitioning);
    // we need to copy the map, to avoid modifying the passed-in map
    arguments = Maps.newHashMap(arguments);
    if (key != null) {
      FileSetArguments.setOutputPath(arguments, PartitionedFileSetDataset.getOutputPath(key, partitioning));
    } else if (PartitionedFileSetArguments.getDynamicPartitioner(arguments) != null) {
      // when using DynamicPartitioner, use the baseLocation of the fileSet as the output location
      FileSetArguments.setBaseOutputPath(arguments);
    }
  }
  return arguments;
}
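A sketch of the caller's side (the partition field names are illustrative): if the runtime arguments carry an output partition key but no explicit output path, this method derives the path from the key and the dataset's partitioning.

Map<String, String> outputArgs = new HashMap<>();
PartitionKey outputKey = PartitionKey.builder()
  .addStringField("league", "nfl")   // illustrative partition fields
  .addIntField("season", 2017)
  .build();
PartitionedFileSetArguments.setOutputPartitionKey(outputArgs, outputKey);
// No FileSetArguments.setOutputPath(...) here: updateArgumentsIfNeeded generates the output
// path from the key and the dataset's Partitioning.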
Use of co.cask.cdap.api.dataset.lib.PartitionKey in project cdap by caskdata.
Class TimePartitionedFileSetDataset, method getPartitionOutput:
@Override
public TimePartitionOutput getPartitionOutput(long time) {
  if (isExternal) {
    throw new UnsupportedOperationException(
      "Output is not supported for external time-partitioned file set '" + spec.getName() + "'");
  }
  PartitionKey key = partitionKeyForTime(time);
  assertNotExists(key, true);
  return new BasicTimePartitionOutput(this, getOutputPath(key), key);
}
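A minimal sketch of how program code might use the returned TimePartitionOutput (the variable tpfs and the file name are illustrative; assumes a non-external time-partitioned file set):

TimePartitionOutput output = tpfs.getPartitionOutput(System.currentTimeMillis());
try (OutputStream out = output.getLocation().append("part-0").getOutputStream()) {
  out.write(Bytes.toBytes("some content"));
}
// Register the partition (its key was derived from the timestamp) once the data is written.
output.addPartition();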
Use of co.cask.cdap.api.dataset.lib.PartitionKey in project cdap by caskdata.
Class TimePartitionedFileSetDefinition, method updateArgumentsIfNeeded:
// if the arguments do not contain an output path, but an output partition time, generate an output path from that;
// also convert the output partition time to a partition key and add it to the arguments;
// also call the super class' method to update arguments if it needs to
protected Map<String, String> updateArgumentsIfNeeded(Map<String, String> arguments) {
  Long time = TimePartitionedFileSetArguments.getOutputPartitionTime(arguments);
  if (time != null) {
    // set the output path according to partition time
    if (FileSetArguments.getOutputPath(arguments) == null) {
      String outputPathFormat = TimePartitionedFileSetArguments.getOutputPathFormat(arguments);
      String path;
      if (Strings.isNullOrEmpty(outputPathFormat)) {
        path = String.format("%tF/%tH-%tM.%d", time, time, time, time);
      } else {
        SimpleDateFormat format = new SimpleDateFormat(outputPathFormat);
        String timeZoneID = TimePartitionedFileSetArguments.getOutputPathTimeZone(arguments);
        if (!Strings.isNullOrEmpty(timeZoneID)) {
          format.setTimeZone(TimeZone.getTimeZone(timeZoneID));
        }
        path = format.format(new Date(time));
      }
      arguments = Maps.newHashMap(arguments);
      FileSetArguments.setOutputPath(arguments, path);
    }
    // add the corresponding partition key to the arguments
    PartitionKey outputKey = TimePartitionedFileSetDataset.partitionKeyForTime(time);
    PartitionedFileSetArguments.setOutputPartitionKey(arguments, outputKey);
  }
  // delegate to super class for anything it needs to do
  return updateArgumentsIfNeeded(arguments, TimePartitionedFileSetDataset.PARTITIONING);
}
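A hedged sketch of the corresponding runtime arguments (the timestamp is illustrative, and setOutputPartitionTime is assumed to be the setter matching the getter used above): setting only an output partition time lets this method generate both the output path and the partition key.

Map<String, String> outputArgs = new HashMap<>();
long partitionTime = 1431000000000L;  // illustrative timestamp in milliseconds
TimePartitionedFileSetArguments.setOutputPartitionTime(outputArgs, partitionTime);
// With no explicit output path and no output path format, the method above produces a path of
// the form "<yyyy-MM-dd>/<HH-mm>.<timestamp>" (rendered in the JVM's default time zone) and
// also stores the partition key derived from the timestamp in the arguments.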