Use of io.prestosql.plugin.hive.HiveBucketing.BucketingVersion in project hetu-core by openlookeng.
The class GlueToPrestoConverter, method setStorageBuilder:
private static void setStorageBuilder(StorageDescriptor sd, Storage.Builder storageBuilder) {
requireNonNull(sd.getSerdeInfo(), "StorageDescriptor SerDeInfo is null");
SerDeInfo serdeInfo = sd.getSerdeInfo();
Optional<HiveBucketProperty> bucketProperty = Optional.empty();
if (sd.getNumberOfBuckets() > 0) {
if (isNullOrEmpty(sd.getBucketColumns())) {
throw new PrestoException(HiveErrorCode.HIVE_INVALID_METADATA, "Table/partition metadata has 'numBuckets' set, but 'bucketCols' is not set");
}
List<SortingColumn> sortedBy = ImmutableList.of();
if (!isNullOrEmpty(sd.getSortColumns())) {
sortedBy = sd.getSortColumns().stream()
        .map(column -> new SortingColumn(column.getColumn(), SortingColumn.Order.fromMetastoreApiOrder(column.getSortOrder(), "unknown")))
        .collect(toImmutableList());
}
BucketingVersion bucketingVersion = HiveBucketing.getBucketingVersion(sd.getParameters());
bucketProperty = Optional.of(new HiveBucketProperty(sd.getBucketColumns(), bucketingVersion, sd.getNumberOfBuckets(), sortedBy));
}
storageBuilder.setStorageFormat(StorageFormat.createNullable(serdeInfo.getSerializationLibrary(), sd.getInputFormat(), sd.getOutputFormat()))
        .setLocation(nullToEmpty(sd.getLocation()))
        .setBucketProperty(bucketProperty)
        .setSkewed(sd.getSkewedInfo() != null && !isNullOrEmpty(sd.getSkewedInfo().getSkewedColumnNames()))
        .setSerdeParameters(firstNonNull(serdeInfo.getParameters(), ImmutableMap.of()))
        .build();
}
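HiveBucketing.getBucketingVersion(sd.getParameters()) decides which hashing scheme the bucket files were written with. A minimal sketch of that decision, assuming the standard Hive table parameter name "bucketing_version" and a default of version 1 when the parameter is absent (the key name and default are assumptions, not taken from the snippet above; the real logic lives in HiveBucketing):

import java.util.Map;

import io.prestosql.plugin.hive.HiveBucketing.BucketingVersion;

// Sketch only: resolve the bucketing version from table/storage parameters.
static BucketingVersion resolveBucketingVersion(Map<String, String> parameters) {
    // Hive marks tables written with the newer Murmur3-based scheme as "bucketing_version" = "2";
    // anything else (including a missing parameter) is treated here as the original version-1 hashing.
    String value = parameters.getOrDefault("bucketing_version", "1");
    return "2".equals(value) ? BucketingVersion.BUCKETING_V2 : BucketingVersion.BUCKETING_V1;
}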
Use of io.prestosql.plugin.hive.HiveBucketing.BucketingVersion in project hetu-core by openlookeng.
The class TestHiveBucketing, method testHashingCompare:
@Test
public void testHashingCompare() {
assertBucketEquals("boolean", null, 0, 0);
assertBucketEquals("boolean", true, 1, 1);
assertBucketEquals("boolean", false, 0, 0);
assertBucketEquals("tinyint", null, 0, 0);
assertBucketEquals("tinyint", (byte) 5, 5, 5);
assertBucketEquals("tinyint", Byte.MIN_VALUE, -128, -128);
assertBucketEquals("tinyint", Byte.MAX_VALUE, 127, 127);
assertBucketEquals("smallint", null, 0, 0);
assertBucketEquals("smallint", (short) 300, 300, 2107031704);
assertBucketEquals("smallint", Short.MIN_VALUE, -32768, 1342976838);
assertBucketEquals("smallint", Short.MAX_VALUE, 32767, -684075052);
assertBucketEquals("int", null, 0, 0);
assertBucketEquals("int", 300_000, 300000, -678663480);
assertBucketEquals("int", Integer.MIN_VALUE, -2147483648, 1194881028);
assertBucketEquals("int", Integer.MAX_VALUE, 2147483647, 1133859967);
assertBucketEquals("bigint", null, 0, 0);
assertBucketEquals("bigint", 300_000_000_000L, -647710651, -888935297);
assertBucketEquals("bigint", Long.MIN_VALUE, -2147483648, 1728983947);
assertBucketEquals("bigint", Long.MAX_VALUE, -2147483648, -536577852);
assertBucketEquals("float", null, 0, 0);
assertBucketEquals("float", 12.34F, 1095069860, -381747602);
assertBucketEquals("float", -Float.MAX_VALUE, -8388609, 470252243);
assertBucketEquals("float", Float.MIN_VALUE, 1, 1206721797);
assertBucketEquals("float", Float.POSITIVE_INFINITY, 2139095040, -292175804);
assertBucketEquals("float", Float.NEGATIVE_INFINITY, -8388608, -1433270801);
assertBucketEquals("float", Float.NaN, 2143289344, -480354314);
// also a NaN
assertBucketEquals("float", intBitsToFloat(0xffc00000), 2143289344, -480354314);
// also a NaN
assertBucketEquals("float", intBitsToFloat(0x7fc00000), 2143289344, -480354314);
// also a NaN
assertBucketEquals("float", intBitsToFloat(0x7fc01234), 2143289344, -480354314);
// also a NaN
assertBucketEquals("float", intBitsToFloat(0xffc01234), 2143289344, -480354314);
assertBucketEquals("double", null, 0, 0);
assertBucketEquals("double", 12.34, 986311098, -2070733568);
assertBucketEquals("double", -Double.MAX_VALUE, 1048576, 14392725);
assertBucketEquals("double", Double.MIN_VALUE, 1, -8838199);
assertBucketEquals("double", Double.POSITIVE_INFINITY, 2146435072, 1614292060);
assertBucketEquals("double", Double.NEGATIVE_INFINITY, -1048576, 141388605);
assertBucketEquals("double", Double.NaN, 2146959360, 1138026565);
// also a NaN
assertBucketEquals("double", longBitsToDouble(0xfff8000000000000L), 2146959360, 1138026565);
// also a NaN
assertBucketEquals("double", longBitsToDouble(0x7ff8123412341234L), 2146959360, 1138026565);
// also a NaN
assertBucketEquals("double", longBitsToDouble(0xfff8123412341234L), 2146959360, 1138026565);
assertBucketEquals("varchar(15)", null, 0, 0);
assertBucketEquals("varchar(15)", "", 1, -965378730);
assertBucketEquals("varchar(15)", "test string", -189841218, -138301454);
// 3-byte UTF-8 sequences (in Basic Plane, i.e. Plane 0)
assertBucketEquals("varchar(15)", "\u5f3a\u5927\u7684Hetu\u5f15\u64ce", 481023052, 1436831192);
// 4 code points: 20FFC - 20FFF. 4-byte UTF-8 sequences in Supplementary Plane 2
assertBucketEquals("varchar(15)", "\uD843\uDFFC\uD843\uDFFD\uD843\uDFFE\uD843\uDFFF", -457487557, -697348811);
assertBucketEquals("string", null, 0, 0);
assertBucketEquals("string", "", 0, -965378730);
assertBucketEquals("string", "test string", -318923937, -138301454);
// 3-byte UTF-8 sequences (in Basic Plane, i.e. Plane 0)
assertBucketEquals("string", "\u5f3a\u5927\u7684Hetu\u5f15\u64ce", 889847277, 1436831192);
// 4 code points: 20FFC - 20FFF. 4-byte UTF-8 sequences in Supplementary Plane 2
assertBucketEquals("string", "\uD843\uDFFC\uD843\uDFFD\uD843\uDFFE\uD843\uDFFF", -1810797254, -697348811);
assertBucketEquals("char(6)", null, 0, 0);
assertBucketEquals("char(6)", "", 1, -965378730);
assertBucketEquals("char(6)", "test_1", 10333957, 1284522943);
assertBucketEquals("date", null, 0, 0);
assertBucketEquals("date", Date.valueOf("1970-01-01"), 0, 1362653161);
assertBucketEquals("date", Date.valueOf("2015-11-19"), 16758, 8542395);
assertBucketEquals("date", Date.valueOf("1950-11-19"), -6983, -431619185);
for (BucketingVersion version : BucketingVersion.values()) {
    List<TypeInfo> typeInfos = ImmutableList.of(timestampTypeInfo);
    assertThatThrownBy(() -> getBucketHashCode(version, typeInfos, new Object[] { 0 }))
            .hasMessage("Computation of Hive bucket hashCode is not supported for Hive primitive category: TIMESTAMP");
    TimestampType timestampType = TimestampType.TIMESTAMP;
    BlockBuilder builder = timestampType.createBlockBuilder(null, 1);
    timestampType.writeLong(builder, 0);
    Page page = new Page(builder.build());
    assertThatThrownBy(() -> getBucketHashCode(version, typeInfos, page, 0))
            .hasMessage("Computation of Hive bucket hashCode is not supported for Hive primitive category: TIMESTAMP");
}
assertBucketEquals("array<double>", null, 0, 0);
assertBucketEquals("array<boolean>", ImmutableList.of(), 0, 0);
assertBucketEquals("array<smallint>", ImmutableList.of((short) 5, (short) 8, (short) 13), 5066, -905011156);
assertBucketEquals("array<string>", ImmutableList.of("test1", "test2", "test3", "test4"), 957612994, 1305539282);
assertBucketEquals("array<array<bigint>>", ImmutableList.of(ImmutableList.of(10L, 20L), ImmutableList.of(-10L, -20L), asList((Object) null)), 326368, 611324477);
assertBucketEquals("map<float,date>", null, 0, 0);
assertBucketEquals("map<double,timestamp>", ImmutableMap.of(), 0, 0);
assertBucketEquals("map<string,bigint>", ImmutableMap.of("key", 123L, "key2", 123456789L, "key3", -123456L), 127880789, -1910999650);
assertBucketEquals("map<array<double>,map<int,string>>", ImmutableMap.of(ImmutableList.of(12.3, 45.7), ImmutableMap.of(123, "test99")), -34001111, -1565874874);
// multiple bucketing columns
assertBucketEquals(ImmutableList.of("float", "array<smallint>", "map<string,bigint>"), ImmutableList.of(12.34F, ImmutableList.of((short) 5, (short) 8, (short) 13), ImmutableMap.of("key", 123L)), 95411006, 932898434);
assertBucketEquals(ImmutableList.of("double", "array<smallint>", "boolean", "map<string,bigint>", "tinyint"), asList(null, ImmutableList.of((short) 5, (short) 8, (short) 13), null, ImmutableMap.of("key", 123L), null), 154207826, -1120812524);
}
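For orientation, the two expected values in each assertBucketEquals call above correspond to the two bucketing versions: the first is presumed to be the version-1 hash and the second the version-2 hash. A minimal sketch of that presumed check, reusing the getBucketHashCode(version, typeInfos, values) signature visible in the loop above and assumed to sit inside TestHiveBucketing (the helper name, the TestNG assertEquals wiring, and the mapping of the two columns to BUCKETING_V1/BUCKETING_V2 are assumptions):

import static org.testng.Assert.assertEquals;

import java.util.List;

import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;

// Sketch: verify the raw bucket hash against the expected value for each bucketing version.
private static void assertBucketHashEquals(List<TypeInfo> typeInfos, Object[] values, int expectedV1, int expectedV2) {
    assertEquals(getBucketHashCode(BucketingVersion.BUCKETING_V1, typeInfos, values), expectedV1);
    assertEquals(getBucketHashCode(BucketingVersion.BUCKETING_V2, typeInfos, values), expectedV2);
}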
Use of io.prestosql.plugin.hive.HiveBucketing.BucketingVersion in project hetu-core by openlookeng.
The class HiveTableProperties, method getBucketProperty:
public static Optional<HiveBucketProperty> getBucketProperty(Map<String, Object> tableProperties) {
List<String> bucketedBy = getBucketedBy(tableProperties);
List<SortingColumn> sortedBy = getSortedBy(tableProperties);
int bucketCount = (Integer) tableProperties.get(BUCKET_COUNT_PROPERTY);
if ((bucketedBy.isEmpty()) && (bucketCount == 0)) {
if (!sortedBy.isEmpty()) {
throw new PrestoException(INVALID_TABLE_PROPERTY, format("%s may be specified only when %s is specified", SORTED_BY_PROPERTY, BUCKETED_BY_PROPERTY));
}
return Optional.empty();
}
if (bucketCount < 0) {
throw new PrestoException(INVALID_TABLE_PROPERTY, format("%s must be greater than zero", BUCKET_COUNT_PROPERTY));
}
if (bucketedBy.isEmpty() || bucketCount == 0) {
throw new PrestoException(INVALID_TABLE_PROPERTY, format("%s and %s must be specified together", BUCKETED_BY_PROPERTY, BUCKET_COUNT_PROPERTY));
}
BucketingVersion bucketingVersion = getBucketingVersion(tableProperties);
return Optional.of(new HiveBucketProperty(bucketedBy, bucketingVersion, bucketCount, sortedBy));
}
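A hypothetical call showing the shape of input this method expects. The string keys and the hand-built values are assumptions for illustration; in the engine the map is produced by the connector's table-property decoders, and since getBucketProperty also consults getBucketingVersion, a bucketing-version entry may be required as well (omitted here). The snippet above only shows the BUCKETED_BY_PROPERTY, BUCKET_COUNT_PROPERTY and SORTED_BY_PROPERTY constants, and the sketch assumes it runs in the same package as HiveTableProperties:

import java.util.Map;
import java.util.Optional;

import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;

static void bucketPropertyExample() {
    Map<String, Object> tableProperties = ImmutableMap.of(
            "bucketed_by", ImmutableList.of("customer_id"),
            "bucket_count", 32,
            "sorted_by", ImmutableList.of());
    // Yields a HiveBucketProperty with 32 buckets on customer_id and no sort columns;
    // an empty bucketed_by together with bucket_count = 0 would yield Optional.empty() instead.
    Optional<HiveBucketProperty> bucketProperty = HiveTableProperties.getBucketProperty(tableProperties);
}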
Use of io.prestosql.plugin.hive.HiveBucketing.BucketingVersion in project hetu-core by openlookeng.
The class BackgroundHiveSplitLoader, method loadPartition:
private ListenableFuture<?> loadPartition(HivePartitionMetadata partition) throws IOException {
HivePartition hivePartition = partition.getHivePartition();
String partitionName = hivePartition.getPartitionId();
Properties schema = getPartitionSchema(table, partition.getPartition());
List<HivePartitionKey> partitionKeys = getPartitionKeys(table, partition.getPartition());
TupleDomain<HiveColumnHandle> effectivePredicate = (TupleDomain<HiveColumnHandle>) compactEffectivePredicate;
if (dynamicFilterSupplier != null && isDynamicFilteringSplitFilteringEnabled(session)) {
if (isPartitionFiltered(partitionKeys, dynamicFilterSupplier.get(), typeManager)) {
// Avoid listing files and creating splits from a partition if it has been pruned due to dynamic filters
return COMPLETED_FUTURE;
}
}
Path path = new Path(getPartitionLocation(table, partition.getPartition()));
InputFormat<?, ?> inputFormat = getInputFormat(configuration, schema, false, jobConf);
FileSystem fs = hdfsEnvironment.getFileSystem(hdfsContext, path);
boolean s3SelectPushdownEnabled = shouldEnablePushdownForTable(session, table, path.toString(), partition.getPartition());
if (inputFormat instanceof SymlinkTextInputFormat) {
if (tableBucketInfo.isPresent()) {
throw new PrestoException(NOT_SUPPORTED, "Bucketed table in SymlinkTextInputFormat is not yet supported");
}
// TODO: This should use an iterator like the HiveFileIterator
ListenableFuture<?> lastResult = COMPLETED_FUTURE;
for (Path targetPath : getTargetPathsFromSymlink(fs, path)) {
// The input should be in TextInputFormat.
TextInputFormat targetInputFormat = new TextInputFormat();
// the splits must be generated using the file system for the target path
// get the configuration for the target path -- it may be a different hdfs instance
FileSystem targetFilesystem = hdfsEnvironment.getFileSystem(hdfsContext, targetPath);
jobConf.setInputFormat(TextInputFormat.class);
targetInputFormat.configure(jobConf);
FileInputFormat.setInputPaths(jobConf, targetPath);
InputSplit[] targetSplits = targetInputFormat.getSplits(jobConf, 0);
InternalHiveSplitFactory splitFactory = new InternalHiveSplitFactory(
        targetFilesystem,
        partitionName,
        inputFormat,
        schema,
        partitionKeys,
        effectivePredicate,
        partition.getColumnCoercions(),
        Optional.empty(),
        isForceLocalScheduling(session),
        s3SelectPushdownEnabled);
lastResult = addSplitsToSource(targetSplits, splitFactory);
if (stopped) {
return COMPLETED_FUTURE;
}
}
return lastResult;
}
Optional<BucketConversion> bucketConversion = Optional.empty();
boolean bucketConversionRequiresWorkerParticipation = false;
if (partition.getPartition().isPresent()) {
Optional<HiveBucketProperty> partitionBucketProperty = partition.getPartition().get().getStorage().getBucketProperty();
if (tableBucketInfo.isPresent() && partitionBucketProperty.isPresent()) {
int readBucketCount = tableBucketInfo.get().getReadBucketCount();
// TODO can partition's bucketing_version be different from table's?
BucketingVersion bucketingVersion = partitionBucketProperty.get().getBucketingVersion();
int partitionBucketCount = partitionBucketProperty.get().getBucketCount();
// Here, it is only checking whether a BucketConversion is needed.
if (readBucketCount != partitionBucketCount) {
bucketConversion = Optional.of(new BucketConversion(bucketingVersion, readBucketCount, partitionBucketCount, tableBucketInfo.get().getBucketColumns()));
if (readBucketCount > partitionBucketCount) {
bucketConversionRequiresWorkerParticipation = true;
}
}
}
}
InternalHiveSplitFactory splitFactory = new InternalHiveSplitFactory(
        fs,
        partitionName,
        inputFormat,
        schema,
        partitionKeys,
        effectivePredicate,
        partition.getColumnCoercions(),
        bucketConversionRequiresWorkerParticipation ? bucketConversion : Optional.empty(),
        isForceLocalScheduling(session),
        s3SelectPushdownEnabled);
// To support custom input formats, call getSplits() on the input format to obtain file splits.
if (!isHudiParquetInputFormat(inputFormat) && shouldUseFileSplitsFromInputFormat(inputFormat)) {
if (tableBucketInfo.isPresent()) {
throw new PrestoException(NOT_SUPPORTED, "Presto cannot read bucketed partition in an input format with UseFileSplitsFromInputFormat annotation: " + inputFormat.getClass().getSimpleName());
}
if (AcidUtils.isTransactionalTable(table.getParameters())) {
throw new PrestoException(NOT_SUPPORTED, "Hive transactional tables in an input format with UseFileSplitsFromInputFormat annotation are not supported: " + inputFormat.getClass().getSimpleName());
}
FileInputFormat.setInputPaths(jobConf, path);
InputSplit[] splits = inputFormat.getSplits(jobConf, 0);
return addSplitsToSource(splits, splitFactory);
}
PathFilter pathFilter = isHudiParquetInputFormat(inputFormat) ? hoodiePathFilterSupplier.get() : path1 -> true;
// S3 Select pushdown works at the granularity of individual S3 objects,
// therefore we must not split files when it is enabled.
boolean splittable = getHeaderCount(schema) == 0 && getFooterCount(schema) == 0 && !s3SelectPushdownEnabled;
List<Path> readPaths;
Optional<DeleteDeltaLocations> deleteDeltaLocations;
long min = Long.MAX_VALUE;
long max = Long.MIN_VALUE;
if (AcidUtils.isTransactionalTable(table.getParameters())) {
boolean isVacuum = queryType.map(type -> type == QueryType.VACUUM).orElse(false);
AcidUtils.Directory directory = hdfsEnvironment.doAs(hdfsContext.getIdentity().getUser(), () -> {
    ValidWriteIdList writeIdList = validWriteIds.orElseThrow(() -> new IllegalStateException("No validWriteIds present"));
    if (isVacuum) {
        writeIdList = new ValidCompactorWriteIdList(writeIdList.writeToString()) {
            @Override
            public RangeResponse isWriteIdRangeValid(long minWriteId, long maxWriteId) {
                // For reasons unknown, ValidCompactorWriteIdList#isWriteIdRangeValid() does not check for
                // aborted transactions, while AcidUtils.getAcidState() adds aborted transactions to both the
                // aborted and the working lists. Override the check to exclude fully aborted ranges.
                RangeResponse writeIdRangeValid = super.isWriteIdRangeValid(minWriteId, maxWriteId);
                if (writeIdRangeValid == RangeResponse.NONE) {
                    return RangeResponse.NONE;
                } else if (super.isWriteIdRangeAborted(minWriteId, maxWriteId) == RangeResponse.ALL) {
                    return RangeResponse.NONE;
                }
                return writeIdRangeValid;
            }
        };
    }
    return AcidUtils.getAcidState(path, configuration, writeIdList, Ref.from(false), true, table.getParameters());
});
if (AcidUtils.isFullAcidTable(table.getParameters())) {
// From Hive 3.0 onwards, delta/base directories always contain an '_orc_acid_version' file whose value is >= 2.
Path baseOrDeltaPath = directory.getBaseDirectory() != null ? directory.getBaseDirectory() : (directory.getCurrentDirectories().size() > 0 ? directory.getCurrentDirectories().get(0).getPath() : null);
if (baseOrDeltaPath != null && AcidUtils.OrcAcidVersion.getAcidVersionFromMetaFile(baseOrDeltaPath, fs) < 2) {
throw new PrestoException(NOT_SUPPORTED, "Hive transactional tables are supported with Hive 3.0 and only after a major compaction has been run");
}
}
readPaths = new ArrayList<>();
boolean isFullVacuum = isVacuum ? Boolean.valueOf(queryInfo.get("FULL").toString()) : false;
if (isFullVacuum) {
// Base will contain everything
min = 0;
}
// For a vacuum operation, include the base directory only when it is a full vacuum.
if (directory.getBaseDirectory() != null && (!isVacuum || isFullVacuum)) {
readPaths.add(directory.getBaseDirectory());
if (isVacuum) {
min = 0;
max = AcidUtils.parseBase(directory.getBaseDirectory());
}
}
// delta directories
for (AcidUtils.ParsedDelta delta : directory.getCurrentDirectories()) {
    if (!delta.isDeleteDelta()) {
        readPaths.add(delta.getPath());
    } else if (isVacuum && !isFullVacuum) {
        // For a minor compaction (a vacuum that is not full), delete_delta files are compacted separately, so read them as well.
        readPaths.add(delta.getPath());
    }
    if (isVacuum) {
        min = Math.min(delta.getMinWriteId(), min);
        max = Math.max(delta.getMaxWriteId(), max);
    }
}
// Create a registry of delete_delta directories for the partition
DeleteDeltaLocations.Builder deleteDeltaLocationsBuilder = DeleteDeltaLocations.builder(path);
for (AcidUtils.ParsedDelta delta : directory.getCurrentDirectories()) {
    // In case of minor compaction, delete_delta directories should not be used for masking.
    if (delta.isDeleteDelta() && (!isVacuum || isFullVacuum)) {
        // For reasons unknown, ParsedDelta.getStatementId() returns 0 even when the parsed statement id is -1,
        // which makes it hard to locate the delete_delta directory, so the statement id is parsed again here.
        OptionalInt statementId = getStatementId(delta.getPath().getName());
        int stmtId = statementId.orElse(0);
        deleteDeltaLocationsBuilder.addDeleteDelta(delta.getPath(), delta.getMinWriteId(), delta.getMaxWriteId(), stmtId);
    }
}
deleteDeltaLocations = deleteDeltaLocationsBuilder.build();
if (!directory.getOriginalFiles().isEmpty()) {
LOG.info("Now supporting read from non-ACID files in ACID reader");
// non-ACID file
int numberOfBuckets = Integer.parseInt(schema.getProperty("bucket_count"));
long[] bucketStartRowOffset = new long[Integer.max(numberOfBuckets, 1)];
for (HadoopShims.HdfsFileStatusWithId f : directory.getOriginalFiles()) {
Path currFilePath = f.getFileStatus().getPath();
int currBucketNumber = getBucketNumber(currFilePath.getName()).getAsInt();
fileIterators.addLast(createInternalHiveSplitIterator(currFilePath, fs, splitFactory, splittable, deleteDeltaLocations, Optional.of(bucketStartRowOffset[currBucketNumber]), pathFilter));
try {
Reader copyReader = OrcFile.createReader(f.getFileStatus().getPath(), OrcFile.readerOptions(configuration));
bucketStartRowOffset[currBucketNumber] += copyReader.getNumberOfRows();
} catch (Exception e) {
throw new PrestoException(NOT_SUPPORTED, e.getMessage());
}
}
}
if (isVacuum && !readPaths.isEmpty()) {
Object vacuumHandle = queryInfo.get("vacuumHandle");
if (vacuumHandle != null && vacuumHandle instanceof HiveVacuumTableHandle) {
HiveVacuumTableHandle hiveVacuumTableHandle = (HiveVacuumTableHandle) vacuumHandle;
hiveVacuumTableHandle.addRange(partitionName, new Range(min, max));
}
}
} else {
readPaths = ImmutableList.of(path);
deleteDeltaLocations = Optional.empty();
}
// Bucketed partitions are fully loaded immediately since all files must be loaded to determine the file to bucket mapping
if (tableBucketInfo.isPresent()) {
// TODO document in addToQueue() that it is sufficient to hold on to last returned future
ListenableFuture<?> lastResult = immediateFuture(null);
for (Path readPath : readPaths) {
lastResult = hiveSplitSource.addToQueue(getBucketedSplits(readPath, fs, splitFactory, tableBucketInfo.get(), bucketConversion, getDeleteDeltaLocationFor(readPath, deleteDeltaLocations), pathFilter));
}
return lastResult;
}
for (Path readPath : readPaths) {
fileIterators.addLast(createInternalHiveSplitIterator(readPath, fs, splitFactory, splittable, getDeleteDeltaLocationFor(readPath, deleteDeltaLocations), Optional.empty(), pathFilter));
}
return COMPLETED_FUTURE;
}
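The bucket-conversion decision near the top of loadPartition is easy to miss in the surrounding code; a condensed restatement of that logic follows. The BucketConversion constructor arguments mirror the call above, while the helper name and the bucket-column type are assumptions, and the sketch is assumed to live inside BackgroundHiveSplitLoader so BucketConversion and HiveColumnHandle are in scope:

import java.util.List;
import java.util.Optional;

// Sketch only: restates the decision made when both the table and the partition carry bucketing metadata.
static Optional<BucketConversion> decideBucketConversion(
        BucketingVersion bucketingVersion,
        int readBucketCount,
        int partitionBucketCount,
        List<HiveColumnHandle> bucketColumns) {
    if (readBucketCount == partitionBucketCount) {
        // Bucket counts match: files map to buckets directly and no conversion is attached to the splits.
        return Optional.empty();
    }
    // Counts differ: splits carry a BucketConversion descriptor. In loadPartition, the case
    // readBucketCount > partitionBucketCount additionally flags the conversion as requiring worker participation.
    return Optional.of(new BucketConversion(bucketingVersion, readBucketCount, partitionBucketCount, bucketColumns));
}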
Use of io.prestosql.plugin.hive.HiveBucketing.BucketingVersion in project hetu-core by openlookeng.
The class HiveBucketProperty, method fromStorageDescriptor:
public static Optional<HiveBucketProperty> fromStorageDescriptor(Map<String, String> tableParameters, StorageDescriptor storageDescriptor, String tablePartitionName) {
boolean bucketColsSet = storageDescriptor.isSetBucketCols() && !storageDescriptor.getBucketCols().isEmpty();
boolean numBucketsSet = storageDescriptor.isSetNumBuckets() && storageDescriptor.getNumBuckets() > 0;
if (!numBucketsSet) {
// In Hive, a table is considered not bucketed when bucketCols is set but numBuckets is not.
return Optional.empty();
}
if (!bucketColsSet) {
throw new PrestoException(HIVE_INVALID_METADATA, "Table/partition metadata has 'numBuckets' set, but 'bucketCols' is not set: " + tablePartitionName);
}
List<SortingColumn> localSortedBy = ImmutableList.of();
if (storageDescriptor.isSetSortCols()) {
localSortedBy = storageDescriptor.getSortCols().stream().map(order -> SortingColumn.fromMetastoreApiOrder(order, tablePartitionName)).collect(toImmutableList());
}
BucketingVersion localBucketingVersion = HiveBucketing.getBucketingVersion(tableParameters);
return Optional.of(new HiveBucketProperty(storageDescriptor.getBucketCols(), localBucketingVersion, storageDescriptor.getNumBuckets(), localSortedBy));
}
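A hypothetical call site for the method above, deriving the optional bucket property from a Thrift metastore table. Table.getParameters(), getSd(), getDbName() and getTableName() are standard org.apache.hadoop.hive.metastore.api.Table accessors; the wrapper itself and the tablePartitionName format are illustrative, and the sketch assumes it runs in the same package as HiveBucketProperty:

import java.util.Optional;

import org.apache.hadoop.hive.metastore.api.Table;

// Sketch: derive the bucket property (including its BucketingVersion) for a metastore table.
static Optional<HiveBucketProperty> bucketPropertyOf(Table table) {
    return HiveBucketProperty.fromStorageDescriptor(
            table.getParameters(),
            table.getSd(),
            table.getDbName() + "." + table.getTableName());
}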