Example 1 with BucketingVersion

Use of io.prestosql.plugin.hive.HiveBucketing.BucketingVersion in project hetu-core by openlookeng.

From the class GlueToPrestoConverter, the method setStorageBuilder:

private static void setStorageBuilder(StorageDescriptor sd, Storage.Builder storageBuilder) {
    requireNonNull(sd.getSerdeInfo(), "StorageDescriptor SerDeInfo is null");
    SerDeInfo serdeInfo = sd.getSerdeInfo();
    Optional<HiveBucketProperty> bucketProperty = Optional.empty();
    if (sd.getNumberOfBuckets() > 0) {
        if (isNullOrEmpty(sd.getBucketColumns())) {
            throw new PrestoException(HiveErrorCode.HIVE_INVALID_METADATA, "Table/partition metadata has 'numBuckets' set, but 'bucketCols' is not set");
        }
        List<SortingColumn> sortedBy = ImmutableList.of();
        if (!isNullOrEmpty(sd.getSortColumns())) {
            sortedBy = sd.getSortColumns().stream().map(column -> new SortingColumn(column.getColumn(), SortingColumn.Order.fromMetastoreApiOrder(column.getSortOrder(), "unknown"))).collect(toImmutableList());
        }
        BucketingVersion bucketingVersion = HiveBucketing.getBucketingVersion(sd.getParameters());
        bucketProperty = Optional.of(new HiveBucketProperty(sd.getBucketColumns(), bucketingVersion, sd.getNumberOfBuckets(), sortedBy));
    }
    storageBuilder.setStorageFormat(StorageFormat.createNullable(serdeInfo.getSerializationLibrary(), sd.getInputFormat(), sd.getOutputFormat()))
            .setLocation(nullToEmpty(sd.getLocation()))
            .setBucketProperty(bucketProperty)
            .setSkewed(sd.getSkewedInfo() != null && !isNullOrEmpty(sd.getSkewedInfo().getSkewedColumnNames()))
            .setSerdeParameters(firstNonNull(serdeInfo.getParameters(), ImmutableMap.of()))
            .build();
}
Also used : BucketingVersion(io.prestosql.plugin.hive.HiveBucketing.BucketingVersion) HiveBucketProperty(io.prestosql.plugin.hive.HiveBucketProperty) SortingColumn(io.prestosql.plugin.hive.metastore.SortingColumn) SerDeInfo(com.amazonaws.services.glue.model.SerDeInfo) PrestoException(io.prestosql.spi.PrestoException)
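
setStorageBuilder delegates the version decision to HiveBucketing.getBucketingVersion(sd.getParameters()). A minimal sketch of what such a lookup can look like, assuming Hive's "bucketing_version" parameter convention and the BUCKETING_V1/BUCKETING_V2 enum constants; the helper class below is illustrative, not the connector's implementation:

import java.util.Map;

import io.prestosql.plugin.hive.HiveBucketing.BucketingVersion;

public final class BucketingVersionLookupSketch {

    private BucketingVersionLookupSketch() {}

    // Hypothetical helper: maps the "bucketing_version" table/storage parameter to the enum.
    // For simplicity, anything other than "2" (including a missing parameter) is treated as version 1.
    public static BucketingVersion resolveBucketingVersion(Map<String, String> parameters) {
        String value = parameters.getOrDefault("bucketing_version", "1");
        if ("2".equals(value)) {
            return BucketingVersion.BUCKETING_V2;
        }
        return BucketingVersion.BUCKETING_V1;
    }
}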

Example 2 with BucketingVersion

Use of io.prestosql.plugin.hive.HiveBucketing.BucketingVersion in project hetu-core by openlookeng.

From the class TestHiveBucketing, the method testHashingCompare:

@Test
public void testHashingCompare() {
    assertBucketEquals("boolean", null, 0, 0);
    assertBucketEquals("boolean", true, 1, 1);
    assertBucketEquals("boolean", false, 0, 0);
    assertBucketEquals("tinyint", null, 0, 0);
    assertBucketEquals("tinyint", (byte) 5, 5, 5);
    assertBucketEquals("tinyint", Byte.MIN_VALUE, -128, -128);
    assertBucketEquals("tinyint", Byte.MAX_VALUE, 127, 127);
    assertBucketEquals("smallint", null, 0, 0);
    assertBucketEquals("smallint", (short) 300, 300, 2107031704);
    assertBucketEquals("smallint", Short.MIN_VALUE, -32768, 1342976838);
    assertBucketEquals("smallint", Short.MAX_VALUE, 32767, -684075052);
    assertBucketEquals("int", null, 0, 0);
    assertBucketEquals("int", 300_000, 300000, -678663480);
    assertBucketEquals("int", Integer.MIN_VALUE, -2147483648, 1194881028);
    assertBucketEquals("int", Integer.MAX_VALUE, 2147483647, 1133859967);
    assertBucketEquals("bigint", null, 0, 0);
    assertBucketEquals("bigint", 300_000_000_000L, -647710651, -888935297);
    assertBucketEquals("bigint", Long.MIN_VALUE, -2147483648, 1728983947);
    assertBucketEquals("bigint", Long.MAX_VALUE, -2147483648, -536577852);
    assertBucketEquals("float", null, 0, 0);
    assertBucketEquals("float", 12.34F, 1095069860, -381747602);
    assertBucketEquals("float", -Float.MAX_VALUE, -8388609, 470252243);
    assertBucketEquals("float", Float.MIN_VALUE, 1, 1206721797);
    assertBucketEquals("float", Float.POSITIVE_INFINITY, 2139095040, -292175804);
    assertBucketEquals("float", Float.NEGATIVE_INFINITY, -8388608, -1433270801);
    assertBucketEquals("float", Float.NaN, 2143289344, -480354314);
    // also a NaN
    assertBucketEquals("float", intBitsToFloat(0xffc00000), 2143289344, -480354314);
    // also a NaN
    assertBucketEquals("float", intBitsToFloat(0x7fc00000), 2143289344, -480354314);
    // also a NaN
    assertBucketEquals("float", intBitsToFloat(0x7fc01234), 2143289344, -480354314);
    // also a NaN
    assertBucketEquals("float", intBitsToFloat(0xffc01234), 2143289344, -480354314);
    assertBucketEquals("double", null, 0, 0);
    assertBucketEquals("double", 12.34, 986311098, -2070733568);
    assertBucketEquals("double", -Double.MAX_VALUE, 1048576, 14392725);
    assertBucketEquals("double", Double.MIN_VALUE, 1, -8838199);
    assertBucketEquals("double", Double.POSITIVE_INFINITY, 2146435072, 1614292060);
    assertBucketEquals("double", Double.NEGATIVE_INFINITY, -1048576, 141388605);
    assertBucketEquals("double", Double.NaN, 2146959360, 1138026565);
    // also a NaN
    assertBucketEquals("double", longBitsToDouble(0xfff8000000000000L), 2146959360, 1138026565);
    // also a NaN
    assertBucketEquals("double", longBitsToDouble(0x7ff8123412341234L), 2146959360, 1138026565);
    // also a NaN
    assertBucketEquals("double", longBitsToDouble(0xfff8123412341234L), 2146959360, 1138026565);
    assertBucketEquals("varchar(15)", null, 0, 0);
    assertBucketEquals("varchar(15)", "", 1, -965378730);
    assertBucketEquals("varchar(15)", "test string", -189841218, -138301454);
    // 3-byte UTF-8 sequences (in Basic Plane, i.e. Plane 0)
    assertBucketEquals("varchar(15)", "\u5f3a\u5927\u7684Hetu\u5f15\u64ce", 481023052, 1436831192);
    // 4 code points: 20FFC - 20FFF. 4-byte UTF-8 sequences in Supplementary Plane 2
    assertBucketEquals("varchar(15)", "\uD843\uDFFC\uD843\uDFFD\uD843\uDFFE\uD843\uDFFF", -457487557, -697348811);
    assertBucketEquals("string", null, 0, 0);
    assertBucketEquals("string", "", 0, -965378730);
    assertBucketEquals("string", "test string", -318923937, -138301454);
    // 3-byte UTF-8 sequences (in Basic Plane, i.e. Plane 0)
    assertBucketEquals("string", "\u5f3a\u5927\u7684Hetu\u5f15\u64ce", 889847277, 1436831192);
    // 4 code points: 20FFC - 20FFF. 4-byte UTF-8 sequences in Supplementary Plane 2
    assertBucketEquals("string", "\uD843\uDFFC\uD843\uDFFD\uD843\uDFFE\uD843\uDFFF", -1810797254, -697348811);
    assertBucketEquals("char(6)", null, 0, 0);
    assertBucketEquals("char(6)", "", 1, -965378730);
    assertBucketEquals("char(6)", "test_1", 10333957, 1284522943);
    assertBucketEquals("date", null, 0, 0);
    assertBucketEquals("date", Date.valueOf("1970-01-01"), 0, 1362653161);
    assertBucketEquals("date", Date.valueOf("2015-11-19"), 16758, 8542395);
    assertBucketEquals("date", Date.valueOf("1950-11-19"), -6983, -431619185);
    for (BucketingVersion version : BucketingVersion.values()) {
        List<TypeInfo> typeInfos = ImmutableList.of(timestampTypeInfo);
        assertThatThrownBy(() -> getBucketHashCode(version, typeInfos, new Object[] { 0 })).hasMessage("Computation of Hive bucket hashCode is not supported for Hive primitive category: TIMESTAMP");
        TimestampType timestampType = TimestampType.TIMESTAMP;
        BlockBuilder builder = timestampType.createBlockBuilder(null, 1);
        timestampType.writeLong(builder, 0);
        Page page = new Page(builder.build());
        assertThatThrownBy(() -> getBucketHashCode(version, typeInfos, page, 0)).hasMessage("Computation of Hive bucket hashCode is not supported for Hive primitive category: TIMESTAMP");
    }
    assertBucketEquals("array<double>", null, 0, 0);
    assertBucketEquals("array<boolean>", ImmutableList.of(), 0, 0);
    assertBucketEquals("array<smallint>", ImmutableList.of((short) 5, (short) 8, (short) 13), 5066, -905011156);
    assertBucketEquals("array<string>", ImmutableList.of("test1", "test2", "test3", "test4"), 957612994, 1305539282);
    assertBucketEquals("array<array<bigint>>", ImmutableList.of(ImmutableList.of(10L, 20L), ImmutableList.of(-10L, -20L), asList((Object) null)), 326368, 611324477);
    assertBucketEquals("map<float,date>", null, 0, 0);
    assertBucketEquals("map<double,timestamp>", ImmutableMap.of(), 0, 0);
    assertBucketEquals("map<string,bigint>", ImmutableMap.of("key", 123L, "key2", 123456789L, "key3", -123456L), 127880789, -1910999650);
    assertBucketEquals("map<array<double>,map<int,string>>", ImmutableMap.of(ImmutableList.of(12.3, 45.7), ImmutableMap.of(123, "test99")), -34001111, -1565874874);
    // multiple bucketing columns
    assertBucketEquals(ImmutableList.of("float", "array<smallint>", "map<string,bigint>"), ImmutableList.of(12.34F, ImmutableList.of((short) 5, (short) 8, (short) 13), ImmutableMap.of("key", 123L)), 95411006, 932898434);
    assertBucketEquals(ImmutableList.of("double", "array<smallint>", "boolean", "map<string,bigint>", "tinyint"), asList(null, ImmutableList.of((short) 5, (short) 8, (short) 13), null, ImmutableMap.of("key", 123L), null), 154207826, -1120812524);
}
Also used : BucketingVersion(io.prestosql.plugin.hive.HiveBucketing.BucketingVersion) TimestampType(io.prestosql.spi.type.TimestampType) Page(io.prestosql.spi.Page) TypeInfo(org.apache.hadoop.hive.serde2.typeinfo.TypeInfo) TypeInfoFactory.timestampTypeInfo(org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory.timestampTypeInfo) BlockBuilder(io.prestosql.spi.block.BlockBuilder) Test(org.testng.annotations.Test)
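
The last two arguments of each assertion are the expected version 1 and version 2 hash codes. A hash code only becomes a bucket index after a final reduction step; the standalone sketch below (class and method names are illustrative, not hetu-core API) shows the usual Hive-compatible reduction:

public final class BucketIndexSketch {

    private BucketIndexSketch() {}

    // Reduces a Hive bucket hash code to a bucket index in [0, bucketCount).
    // Masking with Integer.MAX_VALUE clears the sign bit, keeping the result non-negative
    // even for negative hash codes such as -138301454 above.
    public static int hashToBucket(int hashCode, int bucketCount) {
        return (hashCode & Integer.MAX_VALUE) % bucketCount;
    }

    public static void main(String[] args) {
        // The version 2 hash of "test string" asserted above is -138301454; with 32 buckets it maps to bucket 18.
        System.out.println(hashToBucket(-138301454, 32));
    }
}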

Example 3 with BucketingVersion

Use of io.prestosql.plugin.hive.HiveBucketing.BucketingVersion in project hetu-core by openlookeng.

From the class HiveTableProperties, the method getBucketProperty:

public static Optional<HiveBucketProperty> getBucketProperty(Map<String, Object> tableProperties) {
    List<String> bucketedBy = getBucketedBy(tableProperties);
    List<SortingColumn> sortedBy = getSortedBy(tableProperties);
    int bucketCount = (Integer) tableProperties.get(BUCKET_COUNT_PROPERTY);
    if ((bucketedBy.isEmpty()) && (bucketCount == 0)) {
        if (!sortedBy.isEmpty()) {
            throw new PrestoException(INVALID_TABLE_PROPERTY, format("%s may be specified only when %s is specified", SORTED_BY_PROPERTY, BUCKETED_BY_PROPERTY));
        }
        return Optional.empty();
    }
    if (bucketCount < 0) {
        throw new PrestoException(INVALID_TABLE_PROPERTY, format("%s must be greater than zero", BUCKET_COUNT_PROPERTY));
    }
    if (bucketedBy.isEmpty() || bucketCount == 0) {
        throw new PrestoException(INVALID_TABLE_PROPERTY, format("%s and %s must be specified together", BUCKETED_BY_PROPERTY, BUCKET_COUNT_PROPERTY));
    }
    BucketingVersion bucketingVersion = getBucketingVersion(tableProperties);
    return Optional.of(new HiveBucketProperty(bucketedBy, bucketingVersion, bucketCount, sortedBy));
}
Also used : BucketingVersion(io.prestosql.plugin.hive.HiveBucketing.BucketingVersion) SortingColumn(io.prestosql.plugin.hive.metastore.SortingColumn) PrestoException(io.prestosql.spi.PrestoException)
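
The validation in getBucketProperty has four outcomes: no bucket property, sorted_by without bucketed_by, a negative or inconsistent bucket specification, or a fully formed HiveBucketProperty. A minimal standalone restatement of those checks (illustrative class, plain IllegalArgumentException in place of PrestoException, and the usual property names bucketed_by, bucket_count and sorted_by assumed in the messages):

import java.util.List;

public final class BucketPropertyValidationSketch {

    private BucketPropertyValidationSketch() {}

    // Returns false when the table is not bucketed, true when a bucket property should be built,
    // and throws when the properties are inconsistent, mirroring getBucketProperty above.
    public static boolean isBucketed(List<String> bucketedBy, int bucketCount, List<String> sortedBy) {
        if (bucketedBy.isEmpty() && bucketCount == 0) {
            if (!sortedBy.isEmpty()) {
                throw new IllegalArgumentException("sorted_by may be specified only when bucketed_by is specified");
            }
            return false;
        }
        if (bucketCount < 0) {
            throw new IllegalArgumentException("bucket_count must be greater than zero");
        }
        if (bucketedBy.isEmpty() || bucketCount == 0) {
            throw new IllegalArgumentException("bucketed_by and bucket_count must be specified together");
        }
        return true;
    }
}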

Example 4 with BucketingVersion

Use of io.prestosql.plugin.hive.HiveBucketing.BucketingVersion in project hetu-core by openlookeng.

From the class BackgroundHiveSplitLoader, the method loadPartition:

private ListenableFuture<?> loadPartition(HivePartitionMetadata partition) throws IOException {
    HivePartition hivePartition = partition.getHivePartition();
    String partitionName = hivePartition.getPartitionId();
    Properties schema = getPartitionSchema(table, partition.getPartition());
    List<HivePartitionKey> partitionKeys = getPartitionKeys(table, partition.getPartition());
    TupleDomain<HiveColumnHandle> effectivePredicate = (TupleDomain<HiveColumnHandle>) compactEffectivePredicate;
    if (dynamicFilterSupplier != null && isDynamicFilteringSplitFilteringEnabled(session)) {
        if (isPartitionFiltered(partitionKeys, dynamicFilterSupplier.get(), typeManager)) {
            // Avoid listing files and creating splits from a partition if it has been pruned due to dynamic filters
            return COMPLETED_FUTURE;
        }
    }
    Path path = new Path(getPartitionLocation(table, partition.getPartition()));
    InputFormat<?, ?> inputFormat = getInputFormat(configuration, schema, false, jobConf);
    FileSystem fs = hdfsEnvironment.getFileSystem(hdfsContext, path);
    boolean s3SelectPushdownEnabled = shouldEnablePushdownForTable(session, table, path.toString(), partition.getPartition());
    if (inputFormat instanceof SymlinkTextInputFormat) {
        if (tableBucketInfo.isPresent()) {
            throw new PrestoException(NOT_SUPPORTED, "Bucketed table in SymlinkTextInputFormat is not yet supported");
        }
        // TODO: This should use an iterator like the HiveFileIterator
        ListenableFuture<?> lastResult = COMPLETED_FUTURE;
        for (Path targetPath : getTargetPathsFromSymlink(fs, path)) {
            // The input should be in TextInputFormat.
            TextInputFormat targetInputFormat = new TextInputFormat();
            // the splits must be generated using the file system for the target path
            // get the configuration for the target path -- it may be a different hdfs instance
            FileSystem targetFilesystem = hdfsEnvironment.getFileSystem(hdfsContext, targetPath);
            jobConf.setInputFormat(TextInputFormat.class);
            targetInputFormat.configure(jobConf);
            FileInputFormat.setInputPaths(jobConf, targetPath);
            InputSplit[] targetSplits = targetInputFormat.getSplits(jobConf, 0);
            InternalHiveSplitFactory splitFactory = new InternalHiveSplitFactory(targetFilesystem, partitionName, inputFormat, schema, partitionKeys, effectivePredicate, partition.getColumnCoercions(), Optional.empty(), isForceLocalScheduling(session), s3SelectPushdownEnabled);
            lastResult = addSplitsToSource(targetSplits, splitFactory);
            if (stopped) {
                return COMPLETED_FUTURE;
            }
        }
        return lastResult;
    }
    Optional<BucketConversion> bucketConversion = Optional.empty();
    boolean bucketConversionRequiresWorkerParticipation = false;
    if (partition.getPartition().isPresent()) {
        Optional<HiveBucketProperty> partitionBucketProperty = partition.getPartition().get().getStorage().getBucketProperty();
        if (tableBucketInfo.isPresent() && partitionBucketProperty.isPresent()) {
            int readBucketCount = tableBucketInfo.get().getReadBucketCount();
            // TODO can partition's bucketing_version be different from table's?
            BucketingVersion bucketingVersion = partitionBucketProperty.get().getBucketingVersion();
            int partitionBucketCount = partitionBucketProperty.get().getBucketCount();
            // Here, it is only determining whether a BucketConversion is needed.
            if (readBucketCount != partitionBucketCount) {
                bucketConversion = Optional.of(new BucketConversion(bucketingVersion, readBucketCount, partitionBucketCount, tableBucketInfo.get().getBucketColumns()));
                if (readBucketCount > partitionBucketCount) {
                    bucketConversionRequiresWorkerParticipation = true;
                }
            }
        }
    }
    InternalHiveSplitFactory splitFactory = new InternalHiveSplitFactory(fs, partitionName, inputFormat, schema, partitionKeys, effectivePredicate, partition.getColumnCoercions(), bucketConversionRequiresWorkerParticipation ? bucketConversion : Optional.empty(), isForceLocalScheduling(session), s3SelectPushdownEnabled);
    // To support custom input formats, we want to call getSplits() on the input format to obtain file splits.
    if (!isHudiParquetInputFormat(inputFormat) && shouldUseFileSplitsFromInputFormat(inputFormat)) {
        if (tableBucketInfo.isPresent()) {
            throw new PrestoException(NOT_SUPPORTED, "Presto cannot read bucketed partition in an input format with UseFileSplitsFromInputFormat annotation: " + inputFormat.getClass().getSimpleName());
        }
        if (AcidUtils.isTransactionalTable(table.getParameters())) {
            throw new PrestoException(NOT_SUPPORTED, "Hive transactional tables in an input format with UseFileSplitsFromInputFormat annotation are not supported: " + inputFormat.getClass().getSimpleName());
        }
        FileInputFormat.setInputPaths(jobConf, path);
        InputSplit[] splits = inputFormat.getSplits(jobConf, 0);
        return addSplitsToSource(splits, splitFactory);
    }
    PathFilter pathFilter = isHudiParquetInputFormat(inputFormat) ? hoodiePathFilterSupplier.get() : path1 -> true;
    // S3 Select pushdown works at the granularity of individual S3 objects,
    // therefore we must not split files when it is enabled.
    boolean splittable = getHeaderCount(schema) == 0 && getFooterCount(schema) == 0 && !s3SelectPushdownEnabled;
    List<Path> readPaths;
    Optional<DeleteDeltaLocations> deleteDeltaLocations;
    long min = Long.MAX_VALUE;
    long max = Long.MIN_VALUE;
    if (AcidUtils.isTransactionalTable(table.getParameters())) {
        boolean isVacuum = queryType.map(type -> type == QueryType.VACUUM).orElse(false);
        AcidUtils.Directory directory = hdfsEnvironment.doAs(hdfsContext.getIdentity().getUser(), () -> {
            ValidWriteIdList writeIdList = validWriteIds.orElseThrow(() -> new IllegalStateException("No validWriteIds present"));
            if (isVacuum) {
                writeIdList = new ValidCompactorWriteIdList(writeIdList.writeToString()) {

                    @Override
                    public RangeResponse isWriteIdRangeValid(long minWriteId, long maxWriteId) {
                        // For unknown reasons, ValidCompactorWriteIdList#isWriteIdRangeValid() does not
                        // check for aborted transactions, and AcidUtils.getAcidState() adds aborted transactions to both the aborted and working lists.
                        // Avoid this by overriding.
                        RangeResponse writeIdRangeValid = super.isWriteIdRangeValid(minWriteId, maxWriteId);
                        if (writeIdRangeValid == RangeResponse.NONE) {
                            return RangeResponse.NONE;
                        } else if (super.isWriteIdRangeAborted(minWriteId, maxWriteId) == RangeResponse.ALL) {
                            return RangeResponse.NONE;
                        }
                        return writeIdRangeValid;
                    }
                };
            }
            return AcidUtils.getAcidState(path, configuration, writeIdList, Ref.from(false), true, table.getParameters());
        });
        if (AcidUtils.isFullAcidTable(table.getParameters())) {
            // From Hive version >= 3.0, delta/base files will always have file '_orc_acid_version' with value >= '2'.
            Path baseOrDeltaPath = directory.getBaseDirectory() != null ? directory.getBaseDirectory() : (directory.getCurrentDirectories().size() > 0 ? directory.getCurrentDirectories().get(0).getPath() : null);
            if (baseOrDeltaPath != null && AcidUtils.OrcAcidVersion.getAcidVersionFromMetaFile(baseOrDeltaPath, fs) < 2) {
                throw new PrestoException(NOT_SUPPORTED, "Hive transactional tables are supported with Hive 3.0 and only after a major compaction has been run");
            }
        }
        readPaths = new ArrayList<>();
        boolean isFullVacuum = isVacuum ? Boolean.valueOf(queryInfo.get("FULL").toString()) : false;
        if (isFullVacuum) {
            // Base will contain everything
            min = 0;
        }
        // In case of vacuum, include the base directory only for a full vacuum.
        if (directory.getBaseDirectory() != null && (!isVacuum || isFullVacuum)) {
            readPaths.add(directory.getBaseDirectory());
            if (isVacuum) {
                min = 0;
                max = AcidUtils.parseBase(directory.getBaseDirectory());
            }
        }
        // delta directories
        for (AcidUtils.ParsedDelta delta : directory.getCurrentDirectories()) {
            if (!delta.isDeleteDelta()) {
                readPaths.add(delta.getPath());
            } else if (isVacuum && !isFullVacuum) {
                // In case of minor compaction, all delete_delta files should be compacted separately,
                // so they are also included as read paths here.
                readPaths.add(delta.getPath());
            }
            if (isVacuum) {
                min = Math.min(delta.getMinWriteId(), min);
                max = Math.max(delta.getMaxWriteId(), max);
            }
        }
        // Create a registry of delete_delta directories for the partition
        DeleteDeltaLocations.Builder deleteDeltaLocationsBuilder = DeleteDeltaLocations.builder(path);
        for (AcidUtils.ParsedDelta delta : directory.getCurrentDirectories()) {
            // In case of minor compaction, delete_delta directories should not be used for masking.
            if (delta.isDeleteDelta() && (!isVacuum || isFullVacuum)) {
                // For unknown reasons, ParsedDelta.getStatementId() returns 0 even though the parsed statement id is -1;
                // this creates an issue when trying to locate the delete_delta directory,
                // so the statement id is parsed again here.
                OptionalInt statementId = getStatementId(delta.getPath().getName());
                int stmtId = statementId.orElse(0);
                deleteDeltaLocationsBuilder.addDeleteDelta(delta.getPath(), delta.getMinWriteId(), delta.getMaxWriteId(), stmtId);
            }
        }
        deleteDeltaLocations = deleteDeltaLocationsBuilder.build();
        if (!directory.getOriginalFiles().isEmpty()) {
            LOG.info("Now supporting read from non-ACID files in ACID reader");
            // non-ACID file
            int numberOfBuckets = Integer.parseInt(schema.getProperty("bucket_count"));
            long[] bucketStartRowOffset = new long[Integer.max(numberOfBuckets, 1)];
            for (HadoopShims.HdfsFileStatusWithId f : directory.getOriginalFiles()) {
                Path currFilePath = f.getFileStatus().getPath();
                int currBucketNumber = getBucketNumber(currFilePath.getName()).getAsInt();
                fileIterators.addLast(createInternalHiveSplitIterator(currFilePath, fs, splitFactory, splittable, deleteDeltaLocations, Optional.of(bucketStartRowOffset[currBucketNumber]), pathFilter));
                try {
                    Reader copyReader = OrcFile.createReader(f.getFileStatus().getPath(), OrcFile.readerOptions(configuration));
                    bucketStartRowOffset[currBucketNumber] += copyReader.getNumberOfRows();
                } catch (Exception e) {
                    throw new PrestoException(NOT_SUPPORTED, e.getMessage());
                }
            }
        }
        if (isVacuum && !readPaths.isEmpty()) {
            Object vacuumHandle = queryInfo.get("vacuumHandle");
            if (vacuumHandle != null && vacuumHandle instanceof HiveVacuumTableHandle) {
                HiveVacuumTableHandle hiveVacuumTableHandle = (HiveVacuumTableHandle) vacuumHandle;
                hiveVacuumTableHandle.addRange(partitionName, new Range(min, max));
            }
        }
    } else {
        readPaths = ImmutableList.of(path);
        deleteDeltaLocations = Optional.empty();
    }
    // Bucketed partitions are fully loaded immediately since all files must be loaded to determine the file to bucket mapping
    if (tableBucketInfo.isPresent()) {
        // TODO document in addToQueue() that it is sufficient to hold on to last returned future
        ListenableFuture<?> lastResult = immediateFuture(null);
        for (Path readPath : readPaths) {
            lastResult = hiveSplitSource.addToQueue(getBucketedSplits(readPath, fs, splitFactory, tableBucketInfo.get(), bucketConversion, getDeleteDeltaLocationFor(readPath, deleteDeltaLocations), pathFilter));
        }
        return lastResult;
    }
    for (Path readPath : readPaths) {
        fileIterators.addLast(createInternalHiveSplitIterator(readPath, fs, splitFactory, splittable, getDeleteDeltaLocationFor(readPath, deleteDeltaLocations), Optional.empty(), pathFilter));
    }
    return COMPLETED_FUTURE;
}
Also used : ArrayListMultimap(com.google.common.collect.ArrayListMultimap) Arrays(java.util.Arrays) ListMultimap(com.google.common.collect.ListMultimap) FileSystem(org.apache.hadoop.fs.FileSystem) Range(io.prestosql.plugin.hive.HiveVacuumTableHandle.Range) FileStatus(org.apache.hadoop.fs.FileStatus) FileSplit(org.apache.hadoop.mapred.FileSplit) Matcher(java.util.regex.Matcher) BucketingVersion(io.prestosql.plugin.hive.HiveBucketing.BucketingVersion) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) HiveUtil.getFooterCount(io.prestosql.plugin.hive.HiveUtil.getFooterCount) HiveSessionProperties.isForceLocalScheduling(io.prestosql.plugin.hive.HiveSessionProperties.isForceLocalScheduling) InternalHiveSplitFactory(io.prestosql.plugin.hive.util.InternalHiveSplitFactory) HadoopShims(org.apache.hadoop.hive.shims.HadoopShims) HiveUtil.getHeaderCount(io.prestosql.plugin.hive.HiveUtil.getHeaderCount) Set(java.util.Set) StandardCharsets(java.nio.charset.StandardCharsets) FAIL(io.prestosql.plugin.hive.util.HiveFileIterator.NestedDirectoryPolicy.FAIL) Table(io.prestosql.plugin.hive.metastore.Table) ResumableTask(io.prestosql.plugin.hive.util.ResumableTask) Partition(io.prestosql.plugin.hive.metastore.Partition) ReentrantReadWriteLock(java.util.concurrent.locks.ReentrantReadWriteLock) Supplier(java.util.function.Supplier) ArrayList(java.util.ArrayList) HiveFileIterator(io.prestosql.plugin.hive.util.HiveFileIterator) MetastoreUtil.getPartitionLocation(io.prestosql.plugin.hive.metastore.MetastoreUtil.getPartitionLocation) Futures.immediateFuture(com.google.common.util.concurrent.Futures.immediateFuture) Properties(java.util.Properties) Executor(java.util.concurrent.Executor) TypeManager(io.prestosql.spi.type.TypeManager) HoodieROTablePathFilter(org.apache.hudi.hadoop.HoodieROTablePathFilter) IOException(java.io.IOException) InputStreamReader(java.io.InputStreamReader) ValidCompactorWriteIdList(org.apache.hadoop.hive.common.ValidCompactorWriteIdList) MetastoreUtil.getHiveSchema(io.prestosql.plugin.hive.metastore.MetastoreUtil.getHiveSchema) ColumnHandle(io.prestosql.spi.connector.ColumnHandle) InputSplit(org.apache.hadoop.mapred.InputSplit) ConfigurationUtils(io.prestosql.plugin.hive.util.ConfigurationUtils) BufferedReader(java.io.BufferedReader) OrcFile(org.apache.hadoop.hive.ql.io.orc.OrcFile) ValidWriteIdList(org.apache.hadoop.hive.common.ValidWriteIdList) DynamicFilter(io.prestosql.spi.dynamicfilter.DynamicFilter) QueryType(io.prestosql.spi.resourcegroups.QueryType) HdfsContext(io.prestosql.plugin.hive.HdfsEnvironment.HdfsContext) HoodieParquetRealtimeInputFormat(org.apache.hudi.hadoop.realtime.HoodieParquetRealtimeInputFormat) IntPredicate(java.util.function.IntPredicate) Preconditions.checkArgument(com.google.common.base.Preconditions.checkArgument) ConnectorSession(io.prestosql.spi.connector.ConnectorSession) CharStreams(com.google.common.io.CharStreams) InputFormat(org.apache.hadoop.mapred.InputFormat) Path(org.apache.hadoop.fs.Path) Reader(org.apache.hadoop.hive.ql.io.orc.Reader) PrestoException(io.prestosql.spi.PrestoException) HiveSessionProperties.isDynamicFilteringSplitFilteringEnabled(io.prestosql.plugin.hive.HiveSessionProperties.isDynamicFilteringSplitFilteringEnabled) FileInputFormat(org.apache.hadoop.mapred.FileInputFormat) LocatedFileStatus(org.apache.hadoop.fs.LocatedFileStatus) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) Streams(com.google.common.collect.Streams) Collectors(java.util.stream.Collectors) String.format(java.lang.String.format) 
Preconditions.checkState(com.google.common.base.Preconditions.checkState) RECURSE(io.prestosql.plugin.hive.util.HiveFileIterator.NestedDirectoryPolicy.RECURSE) List(java.util.List) Annotation(java.lang.annotation.Annotation) HIDDEN_FILES_PATH_FILTER(org.apache.hadoop.hive.common.FileUtils.HIDDEN_FILES_PATH_FILTER) Optional(java.util.Optional) Math.max(java.lang.Math.max) NOT_SUPPORTED(io.prestosql.spi.StandardErrorCode.NOT_SUPPORTED) Pattern(java.util.regex.Pattern) IGNORED(io.prestosql.plugin.hive.util.HiveFileIterator.NestedDirectoryPolicy.IGNORED) AcidUtils(org.apache.hadoop.hive.ql.io.AcidUtils) TextInputFormat(org.apache.hadoop.mapred.TextInputFormat) ListenableFuture(com.google.common.util.concurrent.ListenableFuture) Logger(io.airlift.log.Logger) PathFilter(org.apache.hadoop.fs.PathFilter) Deque(java.util.Deque) OptionalInt(java.util.OptionalInt) HiveUtil.getInputFormat(io.prestosql.plugin.hive.HiveUtil.getInputFormat) Iterators(com.google.common.collect.Iterators) SymlinkTextInputFormat(org.apache.hadoop.hive.ql.io.SymlinkTextInputFormat) ImmutableList(com.google.common.collect.ImmutableList) HiveUtil.checkCondition(io.prestosql.plugin.hive.HiveUtil.checkCondition) Objects.requireNonNull(java.util.Objects.requireNonNull) Suppliers(com.google.common.base.Suppliers) HoodieParquetInputFormat(org.apache.hudi.hadoop.HoodieParquetInputFormat) S3SelectPushdown.shouldEnablePushdownForTable(io.prestosql.plugin.hive.S3SelectPushdown.shouldEnablePushdownForTable) Ref(org.apache.hive.common.util.Ref) Iterator(java.util.Iterator) ResumableTasks(io.prestosql.plugin.hive.util.ResumableTasks) TupleDomain(io.prestosql.spi.predicate.TupleDomain) BucketConversion(io.prestosql.plugin.hive.HiveSplit.BucketConversion) NestedDirectoryNotAllowedException(io.prestosql.plugin.hive.util.HiveFileIterator.NestedDirectoryNotAllowedException) HiveUtil.isPartitionFiltered(io.prestosql.plugin.hive.HiveUtil.isPartitionFiltered) HiveUtil.getBucketNumber(io.prestosql.plugin.hive.HiveUtil.getBucketNumber) ConcurrentLinkedDeque(java.util.concurrent.ConcurrentLinkedDeque) JobConf(org.apache.hadoop.mapred.JobConf) Column(io.prestosql.plugin.hive.metastore.Column)
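
Within loadPartition, the partition branch only decides whether a BucketConversion is required and whether workers have to participate in it. A minimal restatement of that decision (illustrative helpers, not hetu-core API; the second comment reflects the usual Hive/Presto bucketing behavior):

public final class BucketConversionDecisionSketch {

    private BucketConversionDecisionSketch() {}

    // A BucketConversion is only needed when the bucket count the reader expects differs from the
    // bucket count the partition was written with.
    public static boolean needsBucketConversion(int readBucketCount, int partitionBucketCount) {
        return readBucketCount != partitionBucketCount;
    }

    // Workers must re-bucket rows only when the reader expects more buckets than exist on disk;
    // reading with fewer buckets can typically be satisfied by assigning whole files to read buckets.
    public static boolean requiresWorkerParticipation(int readBucketCount, int partitionBucketCount) {
        return readBucketCount > partitionBucketCount;
    }
}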

Example 5 with BucketingVersion

Use of io.prestosql.plugin.hive.HiveBucketing.BucketingVersion in project hetu-core by openlookeng.

From the class HiveBucketProperty, the method fromStorageDescriptor:

public static Optional<HiveBucketProperty> fromStorageDescriptor(Map<String, String> tableParameters, StorageDescriptor storageDescriptor, String tablePartitionName) {
    boolean bucketColsSet = storageDescriptor.isSetBucketCols() && !storageDescriptor.getBucketCols().isEmpty();
    boolean numBucketsSet = storageDescriptor.isSetNumBuckets() && storageDescriptor.getNumBuckets() > 0;
    if (!numBucketsSet) {
        // In Hive, a table is considered not bucketed when its bucketCols is set but its numBuckets is not set.
        return Optional.empty();
    }
    if (!bucketColsSet) {
        throw new PrestoException(HIVE_INVALID_METADATA, "Table/partition metadata has 'numBuckets' set, but 'bucketCols' is not set: " + tablePartitionName);
    }
    List<SortingColumn> localSortedBy = ImmutableList.of();
    if (storageDescriptor.isSetSortCols()) {
        localSortedBy = storageDescriptor.getSortCols().stream().map(order -> SortingColumn.fromMetastoreApiOrder(order, tablePartitionName)).collect(toImmutableList());
    }
    BucketingVersion localBucketingVersion = HiveBucketing.getBucketingVersion(tableParameters);
    return Optional.of(new HiveBucketProperty(storageDescriptor.getBucketCols(), localBucketingVersion, storageDescriptor.getNumBuckets(), localSortedBy));
}
Also used : BucketingVersion(io.prestosql.plugin.hive.HiveBucketing.BucketingVersion) SortingColumn(io.prestosql.plugin.hive.metastore.SortingColumn) PrestoException(io.prestosql.spi.PrestoException)
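
A hedged usage sketch for fromStorageDescriptor, assuming the Thrift metastore StorageDescriptor setters shown below and Hive's "bucketing_version" table parameter; the table/partition name is purely illustrative:

import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;

import io.prestosql.plugin.hive.HiveBucketProperty;

import org.apache.hadoop.hive.metastore.api.StorageDescriptor;

import java.util.Optional;

public final class FromStorageDescriptorSketch {

    private FromStorageDescriptorSketch() {}

    public static void main(String[] args) {
        // Thrift storage descriptor for a table bucketed by "customer_id" into 16 buckets, with no sort columns.
        StorageDescriptor storageDescriptor = new StorageDescriptor();
        storageDescriptor.setNumBuckets(16);
        storageDescriptor.setBucketCols(ImmutableList.of("customer_id"));

        // Assumed table parameter: "bucketing_version" = "2" selects version 2 hashing.
        Optional<HiveBucketProperty> property = HiveBucketProperty.fromStorageDescriptor(
                ImmutableMap.of("bucketing_version", "2"),
                storageDescriptor,
                "sales.orders/ds=2015-11-19");

        property.ifPresent(p -> System.out.println(p.getBucketCount() + " buckets, " + p.getBucketingVersion()));
    }
}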

Aggregations

BucketingVersion (io.prestosql.plugin.hive.HiveBucketing.BucketingVersion): 5
PrestoException (io.prestosql.spi.PrestoException): 4
SortingColumn (io.prestosql.plugin.hive.metastore.SortingColumn): 3
SerDeInfo (com.amazonaws.services.glue.model.SerDeInfo): 1
Preconditions.checkArgument (com.google.common.base.Preconditions.checkArgument): 1
Preconditions.checkState (com.google.common.base.Preconditions.checkState): 1
Suppliers (com.google.common.base.Suppliers): 1
ArrayListMultimap (com.google.common.collect.ArrayListMultimap): 1
ImmutableList (com.google.common.collect.ImmutableList): 1
Iterators (com.google.common.collect.Iterators): 1
ListMultimap (com.google.common.collect.ListMultimap): 1
Streams (com.google.common.collect.Streams): 1
CharStreams (com.google.common.io.CharStreams): 1
Futures.immediateFuture (com.google.common.util.concurrent.Futures.immediateFuture): 1
ListenableFuture (com.google.common.util.concurrent.ListenableFuture): 1
Logger (io.airlift.log.Logger): 1
HdfsContext (io.prestosql.plugin.hive.HdfsEnvironment.HdfsContext): 1
HiveBucketProperty (io.prestosql.plugin.hive.HiveBucketProperty): 1
HiveSessionProperties.isDynamicFilteringSplitFilteringEnabled (io.prestosql.plugin.hive.HiveSessionProperties.isDynamicFilteringSplitFilteringEnabled): 1
HiveSessionProperties.isForceLocalScheduling (io.prestosql.plugin.hive.HiveSessionProperties.isForceLocalScheduling): 1