Use of com.facebook.presto.hive.HiveBucketing.HiveBucket in project presto by prestodb.
From the class HivePartitionManager, the method getPartitions. It derives the set of Hive partitions a query must read from the pushed-down constraint, pruning on the partition key columns and enforcing the partition limit.
public HivePartitionResult getPartitions(SemiTransactionalHiveMetastore metastore, ConnectorTableHandle tableHandle, Constraint<ColumnHandle> constraint) {
    HiveTableHandle hiveTableHandle = (HiveTableHandle) tableHandle;
    TupleDomain<ColumnHandle> effectivePredicate = constraint.getSummary();
    SchemaTableName tableName = hiveTableHandle.getSchemaTableName();
    Table table = getTable(metastore, tableName);
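    // Derive the bucket handle, the partition key columns, and any bucket filter implied by the predicate.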
    Optional<HiveBucketHandle> hiveBucketHandle = getHiveBucketHandle(connectorId, table);
    List<HiveColumnHandle> partitionColumns = getPartitionKeyColumnHandles(connectorId, table);
    List<HiveBucket> buckets = getHiveBucketNumbers(table, effectivePredicate);
    TupleDomain<HiveColumnHandle> compactEffectivePredicate = toCompactTupleDomain(effectivePredicate, domainCompactionThreshold);
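    // A predicate that is provably unsatisfiable (none) cannot match any partition.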
    if (effectivePredicate.isNone()) {
        return new HivePartitionResult(partitionColumns, ImmutableList.of(), TupleDomain.none(), TupleDomain.none(), hiveBucketHandle);
    }
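    // An unpartitioned table is modeled as a single partition covering the whole table.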
    if (partitionColumns.isEmpty()) {
        return new HivePartitionResult(partitionColumns, ImmutableList.of(new HivePartition(tableName, compactEffectivePredicate, buckets)), effectivePredicate, TupleDomain.none(), hiveBucketHandle);
    }
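    // Resolve the partition column types, then ask the metastore only for partition names that could satisfy the predicate on the partition keys.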
    List<Type> partitionTypes = partitionColumns.stream().map(column -> typeManager.getType(column.getTypeSignature())).collect(toList());
    List<String> partitionNames = getFilteredPartitionNames(metastore, tableName, partitionColumns, effectivePredicate);
    // do a final pass to filter based on fields that could not be used to filter the partitions
    int partitionCount = 0;
    ImmutableList.Builder<HivePartition> partitions = ImmutableList.builder();
    for (String partitionName : partitionNames) {
        Optional<Map<ColumnHandle, NullableValue>> values = parseValuesAndFilterPartition(partitionName, partitionColumns, partitionTypes, constraint);
        if (values.isPresent()) {
            if (partitionCount == maxPartitions) {
                throw new PrestoException(HIVE_EXCEEDED_PARTITION_LIMIT, format("Query over table '%s' can potentially read more than %s partitions", hiveTableHandle.getSchemaTableName(), maxPartitions));
            }
            partitionCount++;
            partitions.add(new HivePartition(tableName, compactEffectivePredicate, partitionName, values.get(), buckets));
        }
    }
    // All partition key domains will be fully evaluated, so we don't need to include those
    TupleDomain<ColumnHandle> remainingTupleDomain = TupleDomain.withColumnDomains(Maps.filterKeys(effectivePredicate.getDomains().get(), not(Predicates.in(partitionColumns))));
    TupleDomain<ColumnHandle> enforcedTupleDomain = TupleDomain.withColumnDomains(Maps.filterKeys(effectivePredicate.getDomains().get(), Predicates.in(partitionColumns)));
    return new HivePartitionResult(partitionColumns, partitions.build(), remainingTupleDomain, enforcedTupleDomain, hiveBucketHandle);
}
Use of com.facebook.presto.hive.HiveBucketing.HiveBucket in project presto by prestodb.
From the class BackgroundHiveSplitLoader, the method loadPartition. It turns a single Hive partition into splits, with dedicated paths for symlink tables, input formats that supply their own splits, and bucketed layouts.
private void loadPartition(HivePartitionMetadata partition) throws IOException {
    String partitionName = partition.getHivePartition().getPartitionId();
    Properties schema = getPartitionSchema(table, partition.getPartition());
    List<HivePartitionKey> partitionKeys = getPartitionKeys(table, partition.getPartition());
    TupleDomain<HiveColumnHandle> effectivePredicate = partition.getHivePartition().getEffectivePredicate();
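    // Resolve the partition's storage location, the Hadoop configuration for that location, its input format, and the file system.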
    Path path = new Path(getPartitionLocation(table, partition.getPartition()));
    Configuration configuration = hdfsEnvironment.getConfiguration(path);
    InputFormat<?, ?> inputFormat = getInputFormat(configuration, schema, false);
    FileSystem fs = hdfsEnvironment.getFileSystem(session.getUser(), path);
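    // A SymlinkTextInputFormat location holds manifest files whose entries point at the actual data files; each target path is expanded here.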
    if (inputFormat instanceof SymlinkTextInputFormat) {
        if (bucketHandle.isPresent()) {
            throw new PrestoException(StandardErrorCode.NOT_SUPPORTED, "Bucketed table in SymlinkTextInputFormat is not yet supported");
        }
        // TODO: This should use an iterator like the HiveFileIterator
        for (Path targetPath : getTargetPathsFromSymlink(fs, path)) {
            // The input should be in TextInputFormat.
            TextInputFormat targetInputFormat = new TextInputFormat();
            // get the configuration for the target path -- it may be a different hdfs instance
            Configuration targetConfiguration = hdfsEnvironment.getConfiguration(targetPath);
            JobConf targetJob = new JobConf(targetConfiguration);
            targetJob.setInputFormat(TextInputFormat.class);
            targetInputFormat.configure(targetJob);
            FileInputFormat.setInputPaths(targetJob, targetPath);
            InputSplit[] targetSplits = targetInputFormat.getSplits(targetJob, 0);
            if (addSplitsToSource(targetSplits, partitionName, partitionKeys, schema, effectivePredicate, partition.getColumnCoercions())) {
                return;
            }
        }
        return;
    }
    // To support custom input formats, we want to call getSplits() on the input format to obtain file splits.
    if (shouldUseFileSplitsFromInputFormat(inputFormat)) {
        JobConf jobConf = new JobConf(configuration);
        FileInputFormat.setInputPaths(jobConf, path);
        InputSplit[] splits = inputFormat.getSplits(jobConf, 0);
        addSplitsToSource(splits, partitionName, partitionKeys, schema, effectivePredicate, partition.getColumnCoercions());
        return;
    }
    // If only one bucket could match: load that one file
    HiveFileIterator iterator = new HiveFileIterator(path, fs, directoryLister, namenodeStats, partitionName, inputFormat, schema, partitionKeys, effectivePredicate, partition.getColumnCoercions());
    if (!buckets.isEmpty()) {
        int bucketCount = buckets.get(0).getBucketCount();
        List<LocatedFileStatus> list = listAndSortBucketFiles(iterator, bucketCount);
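        // After sorting by name, the file at index i is expected to be the data file for bucket i.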
        List<Iterator<HiveSplit>> iteratorList = new ArrayList<>();
        for (HiveBucket bucket : buckets) {
            int bucketNumber = bucket.getBucketNumber();
            LocatedFileStatus file = list.get(bucketNumber);
            boolean splittable = isSplittable(iterator.getInputFormat(), hdfsEnvironment.getFileSystem(session.getUser(), file.getPath()), file.getPath());
            iteratorList.add(createHiveSplitIterator(iterator.getPartitionName(), file.getPath().toString(), file.getBlockLocations(), 0, file.getLen(), iterator.getSchema(), iterator.getPartitionKeys(), splittable, session, OptionalInt.of(bucketNumber), effectivePredicate, partition.getColumnCoercions()));
        }
        addToHiveSplitSourceRoundRobin(iteratorList);
        return;
    }
    // If table is bucketed: list the directory, sort, tag with bucket id
    if (bucketHandle.isPresent()) {
        // HiveFileIterator skips hidden files automatically.
        int bucketCount = bucketHandle.get().getBucketCount();
        List<LocatedFileStatus> list = listAndSortBucketFiles(iterator, bucketCount);
        List<Iterator<HiveSplit>> iteratorList = new ArrayList<>();
        for (int bucketIndex = 0; bucketIndex < bucketCount; bucketIndex++) {
            LocatedFileStatus file = list.get(bucketIndex);
            boolean splittable = isSplittable(iterator.getInputFormat(), hdfsEnvironment.getFileSystem(session.getUser(), file.getPath()), file.getPath());
            iteratorList.add(createHiveSplitIterator(iterator.getPartitionName(), file.getPath().toString(), file.getBlockLocations(), 0, file.getLen(), iterator.getSchema(), iterator.getPartitionKeys(), splittable, session, OptionalInt.of(bucketIndex), iterator.getEffectivePredicate(), partition.getColumnCoercions()));
        }
        addToHiveSplitSourceRoundRobin(iteratorList);
        return;
    }
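    // Not a bucketed table: queue the file iterator; the background loader enumerates its files lazily.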
    fileIterators.addLast(iterator);
}