
Example 1 with InternalHiveBlock

Use of io.trino.plugin.hive.InternalHiveSplit.InternalHiveBlock in the trinodb/trino project.

From the class HiveSplitSource, method getNextBatch:

@Override
public CompletableFuture<ConnectorSplitBatch> getNextBatch(ConnectorPartitionHandle partitionHandle, int maxSize) {
    boolean noMoreSplits;
    State state = stateReference.get();
    switch(state.getKind()) {
        case INITIAL:
            noMoreSplits = false;
            break;
        case NO_MORE_SPLITS:
            noMoreSplits = true;
            break;
        case FAILED:
            return failedFuture(state.getThrowable());
        case CLOSED:
            throw new IllegalStateException("HiveSplitSource is already closed");
        default:
            throw new UnsupportedOperationException();
    }
    OptionalInt bucketNumber = toBucketNumber(partitionHandle);
    ListenableFuture<List<ConnectorSplit>> future = queues.borrowBatchAsync(bucketNumber, maxSize, internalSplits -> {
        ImmutableList.Builder<InternalHiveSplit> splitsToInsertBuilder = ImmutableList.builder();
        ImmutableList.Builder<ConnectorSplit> resultBuilder = ImmutableList.builder();
        int removedEstimatedSizeInBytes = 0;
        int removedSplitCount = 0;
        for (InternalHiveSplit internalSplit : internalSplits) {
            // Perform one more dynamic filter check immediately before split is returned to the engine
            if (!internalSplit.getPartitionMatchSupplier().getAsBoolean()) {
                removedEstimatedSizeInBytes += internalSplit.getEstimatedSizeInBytes();
                removedSplitCount++;
                continue;
            }
            long maxSplitBytes = maxSplitSize.toBytes();
            if (remainingInitialSplits.get() > 0) {
                if (remainingInitialSplits.getAndDecrement() > 0) {
                    maxSplitBytes = maxInitialSplitSize.toBytes();
                }
            }
            InternalHiveBlock block = internalSplit.currentBlock();
            long splitBytes;
            if (internalSplit.isSplittable()) {
                long remainingBlockBytes = block.getEnd() - internalSplit.getStart();
                if (remainingBlockBytes <= maxSplitBytes) {
                    splitBytes = remainingBlockBytes;
                } else if (maxSplitBytes * 2 >= remainingBlockBytes) {
                    // Second to last split in this block, generate two evenly sized splits
                    splitBytes = remainingBlockBytes / 2;
                } else {
                    splitBytes = maxSplitBytes;
                }
            } else {
                splitBytes = internalSplit.getEnd() - internalSplit.getStart();
            }
            resultBuilder.add(new HiveSplit(
                    databaseName,
                    tableName,
                    internalSplit.getPartitionName(),
                    internalSplit.getPath(),
                    internalSplit.getStart(),
                    splitBytes,
                    internalSplit.getEstimatedFileSize(),
                    internalSplit.getFileModifiedTime(),
                    internalSplit.getSchema(),
                    internalSplit.getPartitionKeys(),
                    block.getAddresses(),
                    internalSplit.getBucketNumber(),
                    internalSplit.getStatementId(),
                    internalSplit.isForceLocalScheduling(),
                    internalSplit.getTableToPartitionMapping(),
                    internalSplit.getBucketConversion(),
                    internalSplit.getBucketValidation(),
                    internalSplit.isS3SelectPushdownEnabled(),
                    internalSplit.getAcidInfo(),
                    numberOfProcessedSplits.getAndIncrement(),
                    splitWeightProvider.weightForSplitSizeInBytes(splitBytes)));
            internalSplit.increaseStart(splitBytes);
            if (internalSplit.isDone()) {
                removedEstimatedSizeInBytes += internalSplit.getEstimatedSizeInBytes();
                removedSplitCount++;
            } else {
                splitsToInsertBuilder.add(internalSplit);
            }
        }
        estimatedSplitSizeInBytes.addAndGet(-removedEstimatedSizeInBytes);
        bufferedInternalSplitCount.addAndGet(-removedSplitCount);
        List<InternalHiveSplit> splitsToInsert = splitsToInsertBuilder.build();
        List<ConnectorSplit> result = resultBuilder.build();
        return new AsyncQueue.BorrowResult<>(splitsToInsert, result);
    });
    ListenableFuture<ConnectorSplitBatch> transform = Futures.transform(future, splits -> {
        requireNonNull(splits, "splits is null");
        if (recordScannedFiles) {
            splits.forEach(split -> scannedFilePaths.add(((HiveSplit) split).getPath()));
        }
        if (noMoreSplits) {
            // The batch is marked as the last one only when it is empty and the queue for this bucket is finished,
            // so the engine may call getNextBatch one more time. But an extra invocation likely doesn't matter.
            return new ConnectorSplitBatch(splits, splits.isEmpty() && queues.isFinished(bucketNumber));
        } else {
            return new ConnectorSplitBatch(splits, false);
        }
    }, directExecutor());
    return toCompletableFuture(transform);
}
Also used: InternalHiveBlock (io.trino.plugin.hive.InternalHiveSplit.InternalHiveBlock), ImmutableList (com.google.common.collect.ImmutableList), OptionalInt (java.util.OptionalInt), BorrowResult (io.trino.plugin.hive.util.AsyncQueue.BorrowResult), Preconditions.checkState (com.google.common.base.Preconditions.checkState), List (java.util.List), ConnectorSplit (io.trino.spi.connector.ConnectorSplit)
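
The split-sizing decision above is compact but easy to misread, so here is a minimal standalone sketch of just that rule, assuming the same semantics (the class and method names below are illustrative and not part of Trino): take the whole remainder of the block when it fits in one split, split the remainder evenly in two when at most one more full split would be needed, and otherwise emit a full-size split.

// Minimal sketch of the split-sizing rule from getNextBatch above.
// Names are hypothetical; only the arithmetic mirrors the code shown.
public final class SplitSizingSketch {
    private SplitSizingSketch() {}

    static long computeSplitBytes(long remainingBlockBytes, long maxSplitBytes) {
        if (remainingBlockBytes <= maxSplitBytes) {
            // last split in this block: take everything that is left
            return remainingBlockBytes;
        }
        if (maxSplitBytes * 2 >= remainingBlockBytes) {
            // second to last split: produce two evenly sized splits instead of
            // one full-size split followed by a small remainder
            return remainingBlockBytes / 2;
        }
        // otherwise emit a full-size split and continue from the new start offset
        return maxSplitBytes;
    }

    public static void main(String[] args) {
        // 96 MB remaining with a 64 MB max -> two 48 MB splits instead of 64 MB + 32 MB
        System.out.println(computeSplitBytes(96L << 20, 64L << 20)); // 50331648
        // 200 MB remaining with a 64 MB max -> a full-size 64 MB split
        System.out.println(computeSplitBytes(200L << 20, 64L << 20)); // 67108864
    }
}

Note also that remainingInitialSplits swaps in maxInitialSplitSize for the first few splits in the method above, which is why the effective maxSplitBytes can differ between early and late batches.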

Example 2 with InternalHiveBlock

Use of io.trino.plugin.hive.InternalHiveSplit.InternalHiveBlock in the trinodb/trino project.

From the class InternalHiveSplitFactory, method createInternalHiveSplit:

private Optional<InternalHiveSplit> createInternalHiveSplit(
        Path path,
        BlockLocation[] blockLocations,
        long start,
        long length,
        // Estimated because, for example, encrypted S3 files may be padded, so reported size may not reflect actual size
        long estimatedFileSize,
        long fileModificationTime,
        OptionalInt bucketNumber,
        boolean splittable,
        Optional<AcidInfo> acidInfo) {
    String pathString = path.toString();
    if (!pathMatchesPredicate(pathDomain, pathString)) {
        return Optional.empty();
    }
    // per HIVE-13040 empty files are allowed; they produce no splits, so skip them here
    if (estimatedFileSize == 0) {
        return Optional.empty();
    }
    // The dynamic filter may not have been ready when this split source was created,
    // but it might be ready when splits are enumerated lazily.
    if (!partitionMatchSupplier.getAsBoolean()) {
        return Optional.empty();
    }
    if (maxSplitFileSize.isPresent() && estimatedFileSize > maxSplitFileSize.get()) {
        return Optional.empty();
    }
    ImmutableList.Builder<InternalHiveBlock> blockBuilder = ImmutableList.builder();
    for (BlockLocation blockLocation : blockLocations) {
        // clamp the block range
        long blockStart = Math.max(start, blockLocation.getOffset());
        long blockEnd = Math.min(start + length, blockLocation.getOffset() + blockLocation.getLength());
        if (blockStart > blockEnd) {
            // block is outside split range
            continue;
        }
        if (blockStart == blockEnd && !(blockStart == start && blockEnd == start + length)) {
            // skip zero-width block, except in the special circumstance: slice is empty, and the block covers the empty slice interval.
            continue;
        }
        blockBuilder.add(new InternalHiveBlock(blockStart, blockEnd, getHostAddresses(blockLocation)));
    }
    List<InternalHiveBlock> blocks = blockBuilder.build();
    checkBlocks(path, blocks, start, length);
    if (!splittable) {
        // not splittable, use the hosts from the first block if it exists
        blocks = ImmutableList.of(new InternalHiveBlock(start, start + length, blocks.get(0).getAddresses()));
    }
    int bucketNumberIndex = bucketNumber.orElse(0);
    return Optional.of(new InternalHiveSplit(
            partitionName,
            pathString,
            start,
            start + length,
            estimatedFileSize,
            fileModificationTime,
            schema,
            partitionKeys,
            blocks,
            bucketNumber,
            () -> bucketStatementCounters.computeIfAbsent(bucketNumberIndex, index -> new AtomicInteger()).getAndIncrement(),
            splittable,
            forceLocalScheduling && allBlocksHaveAddress(blocks),
            tableToPartitionMapping,
            bucketConversion,
            bucketValidation,
            s3SelectPushdownEnabled && S3SelectPushdown.isCompressionCodecSupported(inputFormat, path),
            acidInfo,
            partitionMatchSupplier));
}
Also used: Arrays (java.util.Arrays), InternalHiveBlock (io.trino.plugin.hive.InternalHiveSplit.InternalHiveBlock), BlockLocation (org.apache.hadoop.fs.BlockLocation), FileSystem (org.apache.hadoop.fs.FileSystem), FileStatus (org.apache.hadoop.fs.FileStatus), OptionalInt (java.util.OptionalInt), BooleanSupplier (java.util.function.BooleanSupplier), Preconditions.checkArgument (com.google.common.base.Preconditions.checkArgument), FileSplit (org.apache.hadoop.mapred.FileSplit), ImmutableList (com.google.common.collect.ImmutableList), AtomicInteger (java.util.concurrent.atomic.AtomicInteger), Map (java.util.Map), Objects.requireNonNull (java.util.Objects.requireNonNull), InputFormat (org.apache.hadoop.mapred.InputFormat), Path (org.apache.hadoop.fs.Path), Slices.utf8Slice (io.airlift.slice.Slices.utf8Slice), HiveColumnHandle.isPathColumnHandle (io.trino.plugin.hive.HiveColumnHandle.isPathColumnHandle), BucketConversion (io.trino.plugin.hive.HiveSplit.BucketConversion), HiveColumnHandle (io.trino.plugin.hive.HiveColumnHandle), InternalHiveSplit (io.trino.plugin.hive.InternalHiveSplit), AcidTransaction (io.trino.plugin.hive.acid.AcidTransaction), Properties (java.util.Properties), HivePartitionKey (io.trino.plugin.hive.HivePartitionKey), Domain (io.trino.spi.predicate.Domain), Collection (java.util.Collection), ImmutableList.toImmutableList (com.google.common.collect.ImmutableList.toImmutableList), HiveSplit (io.trino.plugin.hive.HiveSplit), LocatedFileStatus (org.apache.hadoop.fs.LocatedFileStatus), ConcurrentHashMap (java.util.concurrent.ConcurrentHashMap), HiveUtil.isSplittable (io.trino.plugin.hive.util.HiveUtil.isSplittable), IOException (java.io.IOException), TupleDomain (io.trino.spi.predicate.TupleDomain), AcidInfo (io.trino.plugin.hive.AcidInfo), UncheckedIOException (java.io.UncheckedIOException), DataSize (io.airlift.units.DataSize), List (java.util.List), TableToPartitionMapping (io.trino.plugin.hive.TableToPartitionMapping), S3SelectPushdown (io.trino.plugin.hive.s3select.S3SelectPushdown), Optional (java.util.Optional), HostAddress (io.trino.spi.HostAddress)
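
The block-clamping loop in createInternalHiveSplit is the subtle part: each file block is clamped to the split's [start, start + length) range, blocks entirely outside that range are dropped, and zero-width blocks are skipped unless the split itself is empty and the block covers exactly that empty interval. Here is a small self-contained sketch of just that rule, under the same assumptions (the class, record, and method names are hypothetical, not Trino APIs):

import java.util.Optional;

// Minimal sketch of the per-block clamping rule shown above.
public final class BlockClampingSketch {
    record Range(long start, long end) {}

    private BlockClampingSketch() {}

    static Optional<Range> clamp(long blockOffset, long blockLength, long start, long length) {
        long blockStart = Math.max(start, blockOffset);
        long blockEnd = Math.min(start + length, blockOffset + blockLength);
        if (blockStart > blockEnd) {
            // block lies entirely outside the split range
            return Optional.empty();
        }
        if (blockStart == blockEnd && !(blockStart == start && blockEnd == start + length)) {
            // zero-width block that is not the empty-split special case
            return Optional.empty();
        }
        return Optional.of(new Range(blockStart, blockEnd));
    }

    public static void main(String[] args) {
        System.out.println(clamp(0, 128, 64, 128));   // Optional[Range[start=64, end=128]]
        System.out.println(clamp(256, 128, 64, 128)); // Optional.empty (block after the split range)
        System.out.println(clamp(0, 64, 64, 128));    // Optional.empty (zero-width block at the boundary)
    }
}

The real code then validates the resulting block list with checkBlocks and, when the file is not splittable, attributes the whole [start, start + length) range to the first block's hosts, as in the snippet above.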
