Use of com.facebook.presto.hive.HiveBucketing.HiveBucket in project presto by prestodb.
From the class HivePartitionManager, the method getPartitions. It derives the set of Hive partitions a query must read from the pushed-down constraint, pruning on the partition key columns and enforcing the partition limit.
public HivePartitionResult getPartitions(SemiTransactionalHiveMetastore metastore, ConnectorTableHandle tableHandle, Constraint<ColumnHandle> constraint) {
    HiveTableHandle hiveTableHandle = (HiveTableHandle) tableHandle;
    TupleDomain<ColumnHandle> effectivePredicate = constraint.getSummary();
    SchemaTableName tableName = hiveTableHandle.getSchemaTableName();
    Table table = getTable(metastore, tableName);
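    // Derive the bucket handle, the partition key columns, and any bucket filter implied by the predicate.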
    Optional<HiveBucketHandle> hiveBucketHandle = getHiveBucketHandle(connectorId, table);
    List<HiveColumnHandle> partitionColumns = getPartitionKeyColumnHandles(connectorId, table);
    List<HiveBucket> buckets = getHiveBucketNumbers(table, effectivePredicate);
    TupleDomain<HiveColumnHandle> compactEffectivePredicate = toCompactTupleDomain(effectivePredicate, domainCompactionThreshold);
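    // A predicate that is provably unsatisfiable (none) cannot match any partition.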
    if (effectivePredicate.isNone()) {
        return new HivePartitionResult(partitionColumns, ImmutableList.of(), TupleDomain.none(), TupleDomain.none(), hiveBucketHandle);
    }
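    // An unpartitioned table is modeled as a single partition covering the whole table.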
    if (partitionColumns.isEmpty()) {
        return new HivePartitionResult(partitionColumns, ImmutableList.of(new HivePartition(tableName, compactEffectivePredicate, buckets)), effectivePredicate, TupleDomain.none(), hiveBucketHandle);
    }
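    // Resolve the partition column types, then ask the metastore only for partition names that could satisfy the predicate on the partition keys.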
    List<Type> partitionTypes = partitionColumns.stream().map(column -> typeManager.getType(column.getTypeSignature())).collect(toList());
    List<String> partitionNames = getFilteredPartitionNames(metastore, tableName, partitionColumns, effectivePredicate);
    // do a final pass to filter based on fields that could not be used to filter the partitions
    int partitionCount = 0;
    ImmutableList.Builder<HivePartition> partitions = ImmutableList.builder();
    for (String partitionName : partitionNames) {
        Optional<Map<ColumnHandle, NullableValue>> values = parseValuesAndFilterPartition(partitionName, partitionColumns, partitionTypes, constraint);
        if (values.isPresent()) {
            if (partitionCount == maxPartitions) {
                throw new PrestoException(HIVE_EXCEEDED_PARTITION_LIMIT, format("Query over table '%s' can potentially read more than %s partitions", hiveTableHandle.getSchemaTableName(), maxPartitions));
            }
            partitionCount++;
            partitions.add(new HivePartition(tableName, compactEffectivePredicate, partitionName, values.get(), buckets));
        }
    }
    // All partition key domains will be fully evaluated, so we don't need to include those
    TupleDomain<ColumnHandle> remainingTupleDomain = TupleDomain.withColumnDomains(Maps.filterKeys(effectivePredicate.getDomains().get(), not(Predicates.in(partitionColumns))));
    TupleDomain<ColumnHandle> enforcedTupleDomain = TupleDomain.withColumnDomains(Maps.filterKeys(effectivePredicate.getDomains().get(), Predicates.in(partitionColumns)));
    return new HivePartitionResult(partitionColumns, partitions.build(), remainingTupleDomain, enforcedTupleDomain, hiveBucketHandle);
}
Use of com.facebook.presto.hive.HiveBucketing.HiveBucket in project presto by prestodb.
From the class BackgroundHiveSplitLoader, the method loadPartition. It turns a single Hive partition into splits, with dedicated paths for symlink tables, input formats that supply their own splits, and bucketed layouts.
private void loadPartition(HivePartitionMetadata partition) throws IOException {
    String partitionName = partition.getHivePartition().getPartitionId();
    Properties schema = getPartitionSchema(table, partition.getPartition());
    List<HivePartitionKey> partitionKeys = getPartitionKeys(table, partition.getPartition());
    TupleDomain<HiveColumnHandle> effectivePredicate = partition.getHivePartition().getEffectivePredicate();
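    // Resolve the partition's storage location, the Hadoop configuration for that location, its input format, and the file system.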
    Path path = new Path(getPartitionLocation(table, partition.getPartition()));
    Configuration configuration = hdfsEnvironment.getConfiguration(path);
    InputFormat<?, ?> inputFormat = getInputFormat(configuration, schema, false);
    FileSystem fs = hdfsEnvironment.getFileSystem(session.getUser(), path);
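    // A SymlinkTextInputFormat location holds manifest files whose entries point at the actual data files; each target path is expanded here.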
    if (inputFormat instanceof SymlinkTextInputFormat) {
        if (bucketHandle.isPresent()) {
            throw new PrestoException(StandardErrorCode.NOT_SUPPORTED, "Bucketed table in SymlinkTextInputFormat is not yet supported");
        }
        // TODO: This should use an iterator like the HiveFileIterator
        for (Path targetPath : getTargetPathsFromSymlink(fs, path)) {
            // The input should be in TextInputFormat.
            TextInputFormat targetInputFormat = new TextInputFormat();
            // get the configuration for the target path -- it may be a different hdfs instance
            Configuration targetConfiguration = hdfsEnvironment.getConfiguration(targetPath);
            JobConf targetJob = new JobConf(targetConfiguration);
            targetJob.setInputFormat(TextInputFormat.class);
            targetInputFormat.configure(targetJob);
            FileInputFormat.setInputPaths(targetJob, targetPath);
            InputSplit[] targetSplits = targetInputFormat.getSplits(targetJob, 0);
            if (addSplitsToSource(targetSplits, partitionName, partitionKeys, schema, effectivePredicate, partition.getColumnCoercions())) {
                return;
            }
        }
        return;
    }
    // To support custom input formats, we want to call getSplits() on the input format to obtain file splits.
    if (shouldUseFileSplitsFromInputFormat(inputFormat)) {
        JobConf jobConf = new JobConf(configuration);
        FileInputFormat.setInputPaths(jobConf, path);
        InputSplit[] splits = inputFormat.getSplits(jobConf, 0);
        addSplitsToSource(splits, partitionName, partitionKeys, schema, effectivePredicate, partition.getColumnCoercions());
        return;
    }
    // If only one bucket could match: load that one file
    HiveFileIterator iterator = new HiveFileIterator(path, fs, directoryLister, namenodeStats, partitionName, inputFormat, schema, partitionKeys, effectivePredicate, partition.getColumnCoercions());
    if (!buckets.isEmpty()) {
        int bucketCount = buckets.get(0).getBucketCount();
        List<LocatedFileStatus> list = listAndSortBucketFiles(iterator, bucketCount);
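        // After sorting by name, the file at index i is expected to be the data file for bucket i.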
        List<Iterator<HiveSplit>> iteratorList = new ArrayList<>();
        for (HiveBucket bucket : buckets) {
            int bucketNumber = bucket.getBucketNumber();
            LocatedFileStatus file = list.get(bucketNumber);
            boolean splittable = isSplittable(iterator.getInputFormat(), hdfsEnvironment.getFileSystem(session.getUser(), file.getPath()), file.getPath());
            iteratorList.add(createHiveSplitIterator(iterator.getPartitionName(), file.getPath().toString(), file.getBlockLocations(), 0, file.getLen(), iterator.getSchema(), iterator.getPartitionKeys(), splittable, session, OptionalInt.of(bucketNumber), effectivePredicate, partition.getColumnCoercions()));
        }
        addToHiveSplitSourceRoundRobin(iteratorList);
        return;
    }
    // If table is bucketed: list the directory, sort, tag with bucket id
    if (bucketHandle.isPresent()) {
        // HiveFileIterator skips hidden files automatically.
        int bucketCount = bucketHandle.get().getBucketCount();
        List<LocatedFileStatus> list = listAndSortBucketFiles(iterator, bucketCount);
        List<Iterator<HiveSplit>> iteratorList = new ArrayList<>();
        for (int bucketIndex = 0; bucketIndex < bucketCount; bucketIndex++) {
            LocatedFileStatus file = list.get(bucketIndex);
            boolean splittable = isSplittable(iterator.getInputFormat(), hdfsEnvironment.getFileSystem(session.getUser(), file.getPath()), file.getPath());
            iteratorList.add(createHiveSplitIterator(iterator.getPartitionName(), file.getPath().toString(), file.getBlockLocations(), 0, file.getLen(), iterator.getSchema(), iterator.getPartitionKeys(), splittable, session, OptionalInt.of(bucketIndex), iterator.getEffectivePredicate(), partition.getColumnCoercions()));
        }
        addToHiveSplitSourceRoundRobin(iteratorList);
        return;
    }
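    // Not a bucketed table: queue the file iterator; the background loader enumerates its files lazily.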
    fileIterators.addLast(iterator);
}