Use of io.trino.plugin.hive.util.HiveBucketing.BucketingVersion in project trino by trinodb.
The class HiveBucketProperty, method fromStorageDescriptor.
public static Optional<HiveBucketProperty> fromStorageDescriptor(Map<String, String> tableParameters, StorageDescriptor storageDescriptor, String tablePartitionName) {
boolean bucketColsSet = storageDescriptor.isSetBucketCols() && !storageDescriptor.getBucketCols().isEmpty();
boolean numBucketsSet = storageDescriptor.isSetNumBuckets() && storageDescriptor.getNumBuckets() > 0;
if (!numBucketsSet) {
// In Hive, a table is considered not bucketed when bucketCols is set but numBuckets is not set.
return Optional.empty();
}
if (!bucketColsSet) {
throw new TrinoException(HIVE_INVALID_METADATA, "Table/partition metadata has 'numBuckets' set, but 'bucketCols' is not set: " + tablePartitionName);
}
List<SortingColumn> sortedBy = ImmutableList.of();
if (storageDescriptor.isSetSortCols()) {
sortedBy = storageDescriptor.getSortCols().stream().map(order -> SortingColumn.fromMetastoreApiOrder(order, tablePartitionName)).collect(toImmutableList());
}
BucketingVersion bucketingVersion = HiveBucketing.getBucketingVersion(tableParameters);
return Optional.of(new HiveBucketProperty(storageDescriptor.getBucketCols(), bucketingVersion, storageDescriptor.getNumBuckets(), sortedBy));
}
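A hedged usage sketch (not taken from the Trino sources) of how the two guard clauses interact; it assumes the Thrift-generated StorageDescriptor setters and an empty table-parameter map, which resolves to the default bucketing version:
// Hypothetical sketch; imports elided as in the snippets above.
StorageDescriptor sd = new StorageDescriptor();
sd.setBucketCols(ImmutableList.of("user_id"));
// numBuckets is unset, so bucketCols alone does not make the table bucketed:
// fromStorageDescriptor(ImmutableMap.of(), sd, "t") -> Optional.empty()
sd.setNumBuckets(16);
// with bucketCols and a positive numBuckets both set, the property is materialized:
// fromStorageDescriptor(ImmutableMap.of(), sd, "t") -> Optional.of(a HiveBucketProperty with 16 buckets on user_id)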
Use of io.trino.plugin.hive.util.HiveBucketing.BucketingVersion in project trino by trinodb.
The class TestHiveBucketing, method testHashingCompare.
@Test
public void testHashingCompare() {
assertBucketEquals("string", "Trino rocks", 1132136730, -399107423);
assertEquals(HiveBucketing.getBucketNumber(1132136730, 4), 2);
assertEquals(HiveBucketing.getBucketNumber(-399107423, 4), 1);
assertBucketEquals("boolean", null, 0, 0);
assertBucketEquals("boolean", true, 1, 1);
assertBucketEquals("boolean", false, 0, 0);
assertBucketEquals("tinyint", null, 0, 0);
assertBucketEquals("tinyint", (byte) 5, 5, 5);
assertBucketEquals("tinyint", Byte.MIN_VALUE, -128, -128);
assertBucketEquals("tinyint", Byte.MAX_VALUE, 127, 127);
assertBucketEquals("smallint", null, 0, 0);
assertBucketEquals("smallint", (short) 300, 300, 2107031704);
assertBucketEquals("smallint", Short.MIN_VALUE, -32768, 1342976838);
assertBucketEquals("smallint", Short.MAX_VALUE, 32767, -684075052);
assertBucketEquals("int", null, 0, 0);
assertBucketEquals("int", 300_000, 300000, -678663480);
assertBucketEquals("int", Integer.MIN_VALUE, -2147483648, 1194881028);
assertBucketEquals("int", Integer.MAX_VALUE, 2147483647, 1133859967);
assertBucketEquals("bigint", null, 0, 0);
assertBucketEquals("bigint", 300_000_000_000L, -647710651, -888935297);
assertBucketEquals("bigint", Long.MIN_VALUE, -2147483648, 1728983947);
assertBucketEquals("bigint", Long.MAX_VALUE, -2147483648, -536577852);
assertBucketEquals("float", null, 0, 0);
assertBucketEquals("float", 12.34F, 1095069860, -381747602);
assertBucketEquals("float", -Float.MAX_VALUE, -8388609, 470252243);
assertBucketEquals("float", Float.MIN_VALUE, 1, 1206721797);
assertBucketEquals("float", Float.POSITIVE_INFINITY, 2139095040, -292175804);
assertBucketEquals("float", Float.NEGATIVE_INFINITY, -8388608, -1433270801);
assertBucketEquals("float", Float.NaN, 2143289344, -480354314);
// also a NaN
assertBucketEquals("float", intBitsToFloat(0xffc00000), 2143289344, -480354314);
// also a NaN
assertBucketEquals("float", intBitsToFloat(0x7fc00000), 2143289344, -480354314);
// also a NaN
assertBucketEquals("float", intBitsToFloat(0x7fc01234), 2143289344, -480354314);
// also a NaN
assertBucketEquals("float", intBitsToFloat(0xffc01234), 2143289344, -480354314);
assertBucketEquals("double", null, 0, 0);
assertBucketEquals("double", 12.34, 986311098, -2070733568);
assertBucketEquals("double", -Double.MAX_VALUE, 1048576, 14392725);
assertBucketEquals("double", Double.MIN_VALUE, 1, -8838199);
assertBucketEquals("double", Double.POSITIVE_INFINITY, 2146435072, 1614292060);
assertBucketEquals("double", Double.NEGATIVE_INFINITY, -1048576, 141388605);
assertBucketEquals("double", Double.NaN, 2146959360, 1138026565);
// also a NaN
assertBucketEquals("double", longBitsToDouble(0xfff8000000000000L), 2146959360, 1138026565);
// also a NaN
assertBucketEquals("double", longBitsToDouble(0x7ff8123412341234L), 2146959360, 1138026565);
// also a NaN
assertBucketEquals("double", longBitsToDouble(0xfff8123412341234L), 2146959360, 1138026565);
assertBucketEquals("varchar(15)", null, 0, 0);
assertBucketEquals("varchar(15)", "", 1, -965378730);
assertBucketEquals("varchar(15)", "test string", -189841218, -138301454);
// 3-byte UTF-8 sequences (in Basic Plane, i.e. Plane 0)
assertBucketEquals("varchar(15)", "\u5f3a\u5927\u7684Trino\u5f15\u64ce", 1899852420, 1784416344);
// 4 code points: 20FFC - 20FFF. 4-byte UTF-8 sequences in Supplementary Plane 2
assertBucketEquals("varchar(15)", "\uD843\uDFFC\uD843\uDFFD\uD843\uDFFE\uD843\uDFFF", -457487557, -697348811);
assertBucketEquals("string", null, 0, 0);
assertBucketEquals("string", "", 0, -965378730);
assertBucketEquals("string", "test string", -318923937, -138301454);
// 3-byte UTF-8 sequences (in Basic Plane, i.e. Plane 0)
assertBucketEquals("string", "\u5f3a\u5927\u7684Trino\u5f15\u64ce", 1688501507, 1784416344);
// 4 code points: 20FFC - 20FFF. 4-byte UTF-8 sequences in Supplementary Plane 2
assertBucketEquals("string", "\uD843\uDFFC\uD843\uDFFD\uD843\uDFFE\uD843\uDFFF", -1810797254, -697348811);
assertBucketEquals("date", null, 0, 0);
assertBucketEquals("date", Date.valueOf("1970-01-01"), 0, 1362653161);
assertBucketEquals("date", Date.valueOf("2015-11-19"), 16758, 8542395);
assertBucketEquals("date", Date.valueOf("1950-11-19"), -6983, -431619185);
for (BucketingVersion version : BucketingVersion.values()) {
List<TypeInfo> typeInfos = ImmutableList.of(timestampTypeInfo);
assertThatThrownBy(() -> version.getBucketHashCode(typeInfos, new Object[] { 0 })).hasMessage("Computation of Hive bucket hashCode is not supported for Hive primitive category: TIMESTAMP");
TimestampType timestampType = createTimestampType(3);
BlockBuilder builder = timestampType.createBlockBuilder(null, 1);
timestampType.writeLong(builder, 0);
Page page = new Page(builder.build());
assertThatThrownBy(() -> version.getBucketHashCode(typeInfos, page, 0)).hasMessage("Computation of Hive bucket hashCode is not supported for Hive primitive category: TIMESTAMP");
}
assertBucketEquals("array<double>", null, 0, 0);
assertBucketEquals("array<boolean>", ImmutableList.of(), 0, 0);
assertBucketEquals("array<smallint>", ImmutableList.of((short) 5, (short) 8, (short) 13), 5066, -905011156);
assertBucketEquals("array<string>", ImmutableList.of("test1", "test2", "test3", "test4"), 957612994, 1305539282);
assertBucketEquals("array<array<bigint>>", ImmutableList.of(ImmutableList.of(10L, 20L), ImmutableList.of(-10L, -20L), asList((Object) null)), 326368, 611324477);
assertBucketEquals("map<float,date>", null, 0, 0);
assertBucketEquals("map<double,timestamp>", ImmutableMap.of(), 0, 0);
assertBucketEquals("map<string,bigint>", ImmutableMap.of("key", 123L, "key2", 123456789L, "key3", -123456L), 127880789, -1910999650);
assertBucketEquals("map<array<double>,map<int,string>>", ImmutableMap.of(ImmutableList.of(12.3, 45.7), ImmutableMap.of(123, "test99")), -34001111, -1565874874);
// multiple bucketing columns
assertBucketEquals(ImmutableList.of("float", "array<smallint>", "map<string,bigint>"), ImmutableList.of(12.34F, ImmutableList.of((short) 5, (short) 8, (short) 13), ImmutableMap.of("key", 123L)), 95411006, 932898434);
assertBucketEquals(ImmutableList.of("double", "array<smallint>", "boolean", "map<string,bigint>", "tinyint"), asList(null, ImmutableList.of((short) 5, (short) 8, (short) 13), null, ImmutableMap.of("key", 123L), null), 154207826, -1120812524);
}
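For reference, a minimal sketch of the bucket-number reduction exercised by the two assertEquals calls at the top of the test; it is assumed to be equivalent to HiveBucketing.getBucketNumber (mask off the sign bit, then reduce modulo the bucket count):
// Sketch only; assumed equivalent of HiveBucketing.getBucketNumber.
static int bucketNumber(int hashCode, int bucketCount) {
    // drop the sign bit so the result is always in [0, bucketCount)
    return (hashCode & Integer.MAX_VALUE) % bucketCount;
}
// bucketNumber(1132136730, 4) -> 2
// bucketNumber(-399107423, 4) -> 1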
Use of io.trino.plugin.hive.util.HiveBucketing.BucketingVersion in project trino by trinodb.
The class TestHivePartitionedBucketFunction, method testMultiplePartitions.
@Test(dataProvider = "hiveBucketingVersion")
public void testMultiplePartitions(BucketingVersion hiveBucketingVersion) {
int numValues = 1024;
int numBuckets = 10;
Block bucketColumn = createLongSequenceBlockWithNull(numValues);
Page bucketedColumnPage = new Page(bucketColumn);
BucketFunction hiveBucketFunction = bucketFunction(hiveBucketingVersion, numBuckets, ImmutableList.of(HIVE_LONG));
int numPartitions = 8;
List<Long> partitionValues = new ArrayList<>();
for (int i = 0; i < numPartitions - 1; i++) {
partitionValues.addAll(Collections.nCopies(numValues / numPartitions, i * 348349L));
}
partitionValues.addAll(Collections.nCopies(numValues / numPartitions, null));
Block partitionColumn = createLongsBlock(partitionValues);
Page page = new Page(bucketColumn, partitionColumn);
Map<Long, HashMultimap<Integer, Integer>> partitionedBucketPositions = new HashMap<>();
for (int i = 0; i < numValues; i++) {
int hiveBucket = hiveBucketFunction.getBucket(bucketedColumnPage, i);
Long hivePartition = partitionValues.get(i);
// record list of positions for each combination of hive partition and bucket
partitionedBucketPositions.computeIfAbsent(hivePartition, ignored -> HashMultimap.create()).put(hiveBucket, i);
}
BucketFunction hivePartitionedBucketFunction = partitionedBucketFunction(hiveBucketingVersion, numBuckets, ImmutableList.of(HIVE_LONG), ImmutableList.of(BIGINT), 4000);
// All positions of a hive partition and bucket should hash to the same partitioned bucket
for (Map.Entry<Long, HashMultimap<Integer, Integer>> partitionEntry : partitionedBucketPositions.entrySet()) {
for (Map.Entry<Integer, Collection<Integer>> entry : partitionEntry.getValue().asMap().entrySet()) {
assertBucketCount(hivePartitionedBucketFunction, page, entry.getValue(), 1);
}
}
assertBucketCount(hivePartitionedBucketFunction, page, IntStream.range(0, numValues).boxed().collect(toImmutableList()), numBuckets * numPartitions);
}
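A plausible shape for the assertBucketCount helper used above (hypothetical; the real helper lives elsewhere in the test class): it counts the distinct buckets produced for the given positions, so an expected count of 1 means every position landed in the same output bucket.
private static void assertBucketCount(BucketFunction function, Page page, Collection<Integer> positions, int expectedBucketCount) {
    // collect the distinct buckets returned for the selected positions
    Set<Integer> buckets = new HashSet<>();
    for (int position : positions) {
        buckets.add(function.getBucket(page, position));
    }
    assertEquals(buckets.size(), expectedBucketCount);
}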
Use of io.trino.plugin.hive.util.HiveBucketing.BucketingVersion in project trino by trinodb.
The class BackgroundHiveSplitLoader, method loadPartition.
private ListenableFuture<Void> loadPartition(HivePartitionMetadata partition) throws IOException {
HivePartition hivePartition = partition.getHivePartition();
String partitionName = hivePartition.getPartitionId();
Properties schema = getPartitionSchema(table, partition.getPartition());
List<HivePartitionKey> partitionKeys = getPartitionKeys(table, partition.getPartition());
TupleDomain<HiveColumnHandle> effectivePredicate = compactEffectivePredicate.transformKeys(HiveColumnHandle.class::cast);
BooleanSupplier partitionMatchSupplier = createPartitionMatchSupplier(dynamicFilter, hivePartition, getPartitionKeyColumnHandles(table, typeManager));
if (!partitionMatchSupplier.getAsBoolean()) {
// Avoid listing files and creating splits from a partition if it has been pruned due to dynamic filters
return COMPLETED_FUTURE;
}
Path path = new Path(getPartitionLocation(table, partition.getPartition()));
Configuration configuration = hdfsEnvironment.getConfiguration(hdfsContext, path);
InputFormat<?, ?> inputFormat = getInputFormat(configuration, schema, false);
FileSystem fs = hdfsEnvironment.getFileSystem(hdfsContext, path);
boolean s3SelectPushdownEnabled = shouldEnablePushdownForTable(session, table, path.toString(), partition.getPartition());
// S3 Select pushdown works at the granularity of individual S3 objects,
// therefore we must not split files when it is enabled.
// Files with skipped header / footer lines are not splittable, except for the special case skip.header.line.count=1
boolean splittable = !s3SelectPushdownEnabled && getFooterCount(schema) == 0 && getHeaderCount(schema) <= 1;
if (inputFormat instanceof SymlinkTextInputFormat) {
if (tableBucketInfo.isPresent()) {
throw new TrinoException(NOT_SUPPORTED, "Bucketed table in SymlinkTextInputFormat is not yet supported");
}
InputFormat<?, ?> targetInputFormat = getInputFormat(configuration, schema, true);
List<Path> targetPaths = hdfsEnvironment.doAs(hdfsContext.getIdentity(), () -> getTargetPathsFromSymlink(fs, path));
Set<Path> parents = targetPaths.stream().map(Path::getParent).distinct().collect(toImmutableSet());
if (optimizeSymlinkListing && parents.size() == 1 && !recursiveDirWalkerEnabled) {
Optional<Iterator<InternalHiveSplit>> manifestFileIterator = buildManifestFileIterator(targetInputFormat, partitionName, schema, partitionKeys, effectivePredicate, partitionMatchSupplier, s3SelectPushdownEnabled, partition.getTableToPartitionMapping(), getOnlyElement(parents), targetPaths, splittable);
if (manifestFileIterator.isPresent()) {
fileIterators.addLast(manifestFileIterator.get());
return COMPLETED_FUTURE;
}
}
return createHiveSymlinkSplits(partitionName, targetInputFormat, schema, partitionKeys, effectivePredicate, partitionMatchSupplier, s3SelectPushdownEnabled, partition.getTableToPartitionMapping(), targetPaths);
}
Optional<BucketConversion> bucketConversion = Optional.empty();
boolean bucketConversionRequiresWorkerParticipation = false;
if (partition.getPartition().isPresent()) {
Optional<HiveBucketProperty> partitionBucketProperty = partition.getPartition().get().getStorage().getBucketProperty();
if (tableBucketInfo.isPresent() && partitionBucketProperty.isPresent()) {
int readBucketCount = tableBucketInfo.get().getReadBucketCount();
// TODO can partition's bucketing_version be different from table's?
BucketingVersion bucketingVersion = partitionBucketProperty.get().getBucketingVersion();
int partitionBucketCount = partitionBucketProperty.get().getBucketCount();
// Here, it's just checking whether a BucketConversion is needed.
if (readBucketCount != partitionBucketCount) {
bucketConversion = Optional.of(new BucketConversion(bucketingVersion, readBucketCount, partitionBucketCount, tableBucketInfo.get().getBucketColumns()));
if (readBucketCount > partitionBucketCount) {
bucketConversionRequiresWorkerParticipation = true;
}
}
}
}
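// Worked example (sketch): reading a partition written with 32 buckets through a table declared
// with 16 lets each file map onto exactly one read bucket, so split assignment is enough; reading
// a 16-bucket partition through a 32-bucket table means one file holds rows of several read
// buckets, so rows must be filtered on the workers. That is presumably why the participation
// flag is set only when readBucketCount > partitionBucketCount.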
Optional<BucketValidation> bucketValidation = Optional.empty();
if (isValidateBucketing(session) && tableBucketInfo.isPresent()) {
BucketSplitInfo info = tableBucketInfo.get();
bucketValidation = Optional.of(new BucketValidation(info.getBucketingVersion(), info.getTableBucketCount(), info.getBucketColumns()));
}
InternalHiveSplitFactory splitFactory = new InternalHiveSplitFactory(fs, partitionName, inputFormat, schema, partitionKeys, effectivePredicate, partitionMatchSupplier, partition.getTableToPartitionMapping(), bucketConversionRequiresWorkerParticipation ? bucketConversion : Optional.empty(), bucketValidation, getMaxInitialSplitSize(session), isForceLocalScheduling(session), s3SelectPushdownEnabled, transaction, maxSplitFileSize);
// To support custom input formats, we want to call getSplits() on the input format to obtain file splits.
if (shouldUseFileSplitsFromInputFormat(inputFormat)) {
if (tableBucketInfo.isPresent()) {
throw new TrinoException(NOT_SUPPORTED, "Trino cannot read bucketed partition in an input format with UseFileSplitsFromInputFormat annotation: " + inputFormat.getClass().getSimpleName());
}
if (AcidUtils.isTransactionalTable(table.getParameters())) {
throw new TrinoException(NOT_SUPPORTED, "Hive transactional tables in an input format with UseFileSplitsFromInputFormat annotation are not supported: " + inputFormat.getClass().getSimpleName());
}
JobConf jobConf = toJobConf(configuration);
FileInputFormat.setInputPaths(jobConf, path);
// Pass SerDes and Table parameters into input format configuration
fromProperties(schema).forEach(jobConf::set);
InputSplit[] splits = hdfsEnvironment.doAs(hdfsContext.getIdentity(), () -> inputFormat.getSplits(jobConf, 0));
return addSplitsToSource(splits, splitFactory);
}
List<Path> readPaths;
List<HdfsFileStatusWithId> fileStatusOriginalFiles = ImmutableList.of();
AcidInfo.Builder acidInfoBuilder = AcidInfo.builder(path);
boolean isFullAcid = AcidUtils.isFullAcidTable(table.getParameters());
if (AcidUtils.isTransactionalTable(table.getParameters())) {
AcidUtils.Directory directory = hdfsEnvironment.doAs(hdfsContext.getIdentity(), () -> AcidUtils.getAcidState(path, configuration, validWriteIds.orElseThrow(() -> new IllegalStateException("No validWriteIds present")), false, true));
if (isFullAcid) {
// Starting with Hive 3.0, delta/base directories always contain a '_orc_acid_version' file with value >= 2.
Path baseOrDeltaPath = directory.getBaseDirectory() != null ? directory.getBaseDirectory() : (directory.getCurrentDirectories().size() > 0 ? directory.getCurrentDirectories().get(0).getPath() : null);
if (baseOrDeltaPath != null && AcidUtils.OrcAcidVersion.getAcidVersionFromMetaFile(baseOrDeltaPath, fs) >= 2) {
// Trino cannot read ORC ACID tables with version < 2 (written by Hive older than 3.0)
// See https://github.com/trinodb/trino/issues/2790#issuecomment-591901728 for more context
// We perform initial version check based on _orc_acid_version file here.
// If we cannot verify the version (the _orc_acid_version file may not exist),
// we will do extra check based on ORC datafile metadata in OrcPageSourceFactory.
acidInfoBuilder.setOrcAcidVersionValidated(true);
}
}
readPaths = new ArrayList<>();
// base
if (directory.getBaseDirectory() != null) {
readPaths.add(directory.getBaseDirectory());
}
// delta directories
for (AcidUtils.ParsedDelta delta : directory.getCurrentDirectories()) {
if (!delta.isDeleteDelta()) {
readPaths.add(delta.getPath());
}
}
// Create a registry of delete_delta directories for the partition
for (AcidUtils.ParsedDelta delta : directory.getCurrentDirectories()) {
if (delta.isDeleteDelta()) {
if (!isFullAcid) {
throw new TrinoException(HIVE_BAD_DATA, format("Unexpected delete delta for a non full ACID table '%s'. Would be ignored by the reader: %s", table.getSchemaTableName(), delta.getPath()));
}
acidInfoBuilder.addDeleteDelta(delta.getPath());
}
}
// initialize original files status list if present
fileStatusOriginalFiles = directory.getOriginalFiles();
for (HdfsFileStatusWithId hdfsFileStatusWithId : fileStatusOriginalFiles) {
Path originalFilePath = hdfsFileStatusWithId.getFileStatus().getPath();
long originalFileLength = hdfsFileStatusWithId.getFileStatus().getLen();
if (originalFileLength == 0) {
continue;
}
// Hive requires "original" files of transactional tables to conform to the bucketed tables naming pattern, to match them with delete deltas.
int bucketId = getRequiredBucketNumber(originalFilePath);
acidInfoBuilder.addOriginalFile(originalFilePath, originalFileLength, bucketId);
}
} else {
// TODO https://github.com/trinodb/trino/issues/7603 - we should not reference acidInfoBuilder at all when we are not reading from an ACID table
// no ACID; no further validation needed
acidInfoBuilder.setOrcAcidVersionValidated(true);
readPaths = ImmutableList.of(path);
}
// Bucketed partitions are fully loaded immediately since all files must be loaded to determine the file to bucket mapping
if (tableBucketInfo.isPresent()) {
// TODO document in addToQueue() that it is sufficient to hold on to last returned future
ListenableFuture<Void> lastResult = immediateVoidFuture();
for (Path readPath : readPaths) {
// list all files in the partition
List<LocatedFileStatus> files = new ArrayList<>();
try {
Iterators.addAll(files, new HiveFileIterator(table, readPath, fs, directoryLister, namenodeStats, FAIL, ignoreAbsentPartitions));
} catch (HiveFileIterator.NestedDirectoryNotAllowedException e) {
// Fail here to be on the safe side. This seems to be the same as what Hive does
throw new TrinoException(HIVE_INVALID_BUCKET_FILES, format("Hive table '%s' is corrupt. Found sub-directory '%s' in bucket directory for partition: %s", table.getSchemaTableName(), e.getNestedDirectoryPath(), splitFactory.getPartitionName()));
}
Optional<AcidInfo> acidInfo = isFullAcid ? acidInfoBuilder.build() : Optional.empty();
lastResult = hiveSplitSource.addToQueue(getBucketedSplits(files, splitFactory, tableBucketInfo.get(), bucketConversion, splittable, acidInfo));
}
for (HdfsFileStatusWithId hdfsFileStatusWithId : fileStatusOriginalFiles) {
List<LocatedFileStatus> locatedFileStatuses = ImmutableList.of((LocatedFileStatus) hdfsFileStatusWithId.getFileStatus());
Optional<AcidInfo> acidInfo = isFullAcid ? Optional.of(acidInfoBuilder.buildWithRequiredOriginalFiles(getRequiredBucketNumber(hdfsFileStatusWithId.getFileStatus().getPath()))) : Optional.empty();
lastResult = hiveSplitSource.addToQueue(getBucketedSplits(locatedFileStatuses, splitFactory, tableBucketInfo.get(), bucketConversion, splittable, acidInfo));
}
return lastResult;
}
for (Path readPath : readPaths) {
Optional<AcidInfo> acidInfo = isFullAcid ? acidInfoBuilder.build() : Optional.empty();
fileIterators.addLast(createInternalHiveSplitIterator(readPath, fs, splitFactory, splittable, acidInfo));
}
if (!fileStatusOriginalFiles.isEmpty()) {
fileIterators.addLast(generateOriginalFilesSplits(splitFactory, fileStatusOriginalFiles, splittable, acidInfoBuilder, isFullAcid));
}
return COMPLETED_FUTURE;
}
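The original-files loop above extracts a bucket id from each file name. A hedged sketch of that idea, assuming the classic Hive bucket-file naming convention (e.g. 000001_0); the real getRequiredBucketNumber relies on Trino's own parsing and error handling:
// Hypothetical helper; assumes original files are named "<bucketId>_<copy>" as Hive writes them.
private static int parseBucketFromFileName(Path path) {
    String name = path.getName(); // e.g. "000001_0"
    int underscore = name.indexOf('_');
    checkArgument(underscore > 0, "Unexpected original file name: %s", name);
    return Integer.parseInt(name.substring(0, underscore));
}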
Use of io.trino.plugin.hive.util.HiveBucketing.BucketingVersion in project trino by trinodb.
The class HiveTableProperties, method getBucketProperty.
public static Optional<HiveBucketProperty> getBucketProperty(Map<String, Object> tableProperties) {
List<String> bucketedBy = getBucketedBy(tableProperties);
List<SortingColumn> sortedBy = getSortedBy(tableProperties);
int bucketCount = (Integer) tableProperties.get(BUCKET_COUNT_PROPERTY);
if (bucketedBy.isEmpty() && bucketCount == 0) {
if (!sortedBy.isEmpty()) {
throw new TrinoException(INVALID_TABLE_PROPERTY, format("%s may be specified only when %s is specified", SORTED_BY_PROPERTY, BUCKETED_BY_PROPERTY));
}
return Optional.empty();
}
if (bucketCount < 0) {
throw new TrinoException(INVALID_TABLE_PROPERTY, format("%s must be greater than zero", BUCKET_COUNT_PROPERTY));
}
if (bucketedBy.isEmpty() || bucketCount == 0) {
throw new TrinoException(INVALID_TABLE_PROPERTY, format("%s and %s must be specified together", BUCKETED_BY_PROPERTY, BUCKET_COUNT_PROPERTY));
}
BucketingVersion bucketingVersion = getBucketingVersion(tableProperties);
return Optional.of(new HiveBucketProperty(bucketedBy, bucketingVersion, bucketCount, sortedBy));
}
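For orientation, a hedged usage sketch. The property keys and decoded value types (bucketed_by as a list of column names, bucket_count as an Integer, sorted_by as a list) are assumptions here; in practice the connector supplies defaults for every declared table property, including the bucketing version.
// Hypothetical inputs; keys and value types are assumptions, not copied from HiveTableProperties.
Map<String, Object> bucketed = ImmutableMap.of(
        "bucketed_by", ImmutableList.of("user_id"),
        "bucket_count", 16,
        "sorted_by", ImmutableList.of());
// getBucketProperty(bucketed) -> Optional.of(a HiveBucketProperty with 16 buckets on user_id)
Map<String, Object> inconsistent = ImmutableMap.of(
        "bucketed_by", ImmutableList.of("user_id"),
        "bucket_count", 0,
        "sorted_by", ImmutableList.of());
// getBucketProperty(inconsistent) -> TrinoException: bucketed_by and bucket_count must be specified together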