Use of io.trino.plugin.hive.InternalHiveSplit.InternalHiveBlock in project trino by trinodb.
The class HiveSplitSource, method getNextBatch. The method asynchronously borrows up to maxSize buffered internal splits, re-checks the dynamic filter for each, trims each split to the configured maximum split size, and converts the results into HiveSplit instances for the engine.
@Override
public CompletableFuture<ConnectorSplitBatch> getNextBatch(ConnectorPartitionHandle partitionHandle, int maxSize) {
    boolean noMoreSplits;
    State state = stateReference.get();
    switch (state.getKind()) {
        case INITIAL:
            noMoreSplits = false;
            break;
        case NO_MORE_SPLITS:
            noMoreSplits = true;
            break;
        case FAILED:
            return failedFuture(state.getThrowable());
        case CLOSED:
            throw new IllegalStateException("HiveSplitSource is already closed");
        default:
            throw new UnsupportedOperationException();
    }
    OptionalInt bucketNumber = toBucketNumber(partitionHandle);
    ListenableFuture<List<ConnectorSplit>> future = queues.borrowBatchAsync(bucketNumber, maxSize, internalSplits -> {
        ImmutableList.Builder<InternalHiveSplit> splitsToInsertBuilder = ImmutableList.builder();
        ImmutableList.Builder<ConnectorSplit> resultBuilder = ImmutableList.builder();
        int removedEstimatedSizeInBytes = 0;
        int removedSplitCount = 0;
        for (InternalHiveSplit internalSplit : internalSplits) {
            // Perform one more dynamic filter check immediately before split is returned to the engine
            if (!internalSplit.getPartitionMatchSupplier().getAsBoolean()) {
                removedEstimatedSizeInBytes += internalSplit.getEstimatedSizeInBytes();
                removedSplitCount++;
                continue;
            }
            long maxSplitBytes = maxSplitSize.toBytes();
            if (remainingInitialSplits.get() > 0) {
                // The get() pre-check avoids decrementing the counter on every split once the initial splits are exhausted
                if (remainingInitialSplits.getAndDecrement() > 0) {
                    maxSplitBytes = maxInitialSplitSize.toBytes();
                }
            }
            InternalHiveBlock block = internalSplit.currentBlock();
            long splitBytes;
            if (internalSplit.isSplittable()) {
                long remainingBlockBytes = block.getEnd() - internalSplit.getStart();
                if (remainingBlockBytes <= maxSplitBytes) {
                    splitBytes = remainingBlockBytes;
                } else if (maxSplitBytes * 2 >= remainingBlockBytes) {
                    // Second to last split in this block, generate two evenly sized splits
                    splitBytes = remainingBlockBytes / 2;
                } else {
                    splitBytes = maxSplitBytes;
                }
            } else {
                splitBytes = internalSplit.getEnd() - internalSplit.getStart();
            }
            resultBuilder.add(new HiveSplit(
                    databaseName,
                    tableName,
                    internalSplit.getPartitionName(),
                    internalSplit.getPath(),
                    internalSplit.getStart(),
                    splitBytes,
                    internalSplit.getEstimatedFileSize(),
                    internalSplit.getFileModifiedTime(),
                    internalSplit.getSchema(),
                    internalSplit.getPartitionKeys(),
                    block.getAddresses(),
                    internalSplit.getBucketNumber(),
                    internalSplit.getStatementId(),
                    internalSplit.isForceLocalScheduling(),
                    internalSplit.getTableToPartitionMapping(),
                    internalSplit.getBucketConversion(),
                    internalSplit.getBucketValidation(),
                    internalSplit.isS3SelectPushdownEnabled(),
                    internalSplit.getAcidInfo(),
                    numberOfProcessedSplits.getAndIncrement(),
                    splitWeightProvider.weightForSplitSizeInBytes(splitBytes)));
            internalSplit.increaseStart(splitBytes);
            if (internalSplit.isDone()) {
                removedEstimatedSizeInBytes += internalSplit.getEstimatedSizeInBytes();
                removedSplitCount++;
            } else {
                splitsToInsertBuilder.add(internalSplit);
            }
        }
        estimatedSplitSizeInBytes.addAndGet(-removedEstimatedSizeInBytes);
        bufferedInternalSplitCount.addAndGet(-removedSplitCount);
        List<InternalHiveSplit> splitsToInsert = splitsToInsertBuilder.build();
        List<ConnectorSplit> result = resultBuilder.build();
        return new AsyncQueue.BorrowResult<>(splitsToInsert, result);
    });
    ListenableFuture<ConnectorSplitBatch> transform = Futures.transform(future, splits -> {
        requireNonNull(splits, "splits is null");
        if (recordScannedFiles) {
            splits.forEach(split -> scannedFilePaths.add(((HiveSplit) split).getPath()));
        }
        if (noMoreSplits) {
            // The isEmpty() check is conservative: the caller may need one extra getNextBatch invocation
            // to observe completion. But an extra invocation likely doesn't matter.
            return new ConnectorSplitBatch(splits, splits.isEmpty() && queues.isFinished(bucketNumber));
        } else {
            return new ConnectorSplitBatch(splits, false);
        }
    }, directExecutor());
    return toCompletableFuture(transform);
}
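The split-sizing rule in the method above can be seen in isolation. The following minimal sketch walks a single block; chooseSplitBytes is a hypothetical helper written for illustration (not part of Trino's API), and the byte values are made up:

// A minimal, self-contained sketch of the split-sizing rule above.
// chooseSplitBytes is a hypothetical helper, not part of Trino.
final class SplitSizingSketch {
    static long chooseSplitBytes(long remainingBlockBytes, long maxSplitBytes) {
        if (remainingBlockBytes <= maxSplitBytes) {
            // last split in the block: take everything that is left
            return remainingBlockBytes;
        }
        if (maxSplitBytes * 2 >= remainingBlockBytes) {
            // second to last split: divide the remainder evenly
            return remainingBlockBytes / 2;
        }
        // plenty of data left: emit a full-size split
        return maxSplitBytes;
    }

    public static void main(String[] args) {
        long max = 64;
        // walking a 200-byte block yields splits of 64, 64, 36, 36
        for (long remaining = 200; remaining > 0; ) {
            long splitBytes = chooseSplitBytes(remaining, max);
            System.out.println(splitBytes);
            remaining -= splitBytes;
        }
    }
}

Without the halving rule, the same block would end as 64, 64, 64, 8; splitting the last 72 bytes evenly avoids scheduling one full split followed by a tiny one.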
Use of io.trino.plugin.hive.InternalHiveSplit.InternalHiveBlock in project trino by trinodb.
The class InternalHiveSplitFactory, method createInternalHiveSplit. The method filters out files that cannot produce splits, clamps each file block to the split range, and builds an InternalHiveSplit from the surviving blocks.
private Optional<InternalHiveSplit> createInternalHiveSplit(
        Path path,
        BlockLocation[] blockLocations,
        long start,
        long length,
        // Estimated because, for example, encrypted S3 files may be padded, so reported size may not reflect actual size
        long estimatedFileSize,
        long fileModificationTime,
        OptionalInt bucketNumber,
        boolean splittable,
        Optional<AcidInfo> acidInfo) {
    String pathString = path.toString();
    if (!pathMatchesPredicate(pathDomain, pathString)) {
        return Optional.empty();
    }
    // per HIVE-13040 empty files are allowed; they simply produce no splits
    if (estimatedFileSize == 0) {
        return Optional.empty();
    }
    // The dynamic filter may not have been ready when the partition was loaded,
    // but it might be ready when splits are enumerated lazily.
    if (!partitionMatchSupplier.getAsBoolean()) {
        return Optional.empty();
    }
    if (maxSplitFileSize.isPresent() && estimatedFileSize > maxSplitFileSize.get()) {
        return Optional.empty();
    }
    ImmutableList.Builder<InternalHiveBlock> blockBuilder = ImmutableList.builder();
    for (BlockLocation blockLocation : blockLocations) {
        // clamp the block range to the split range
        long blockStart = Math.max(start, blockLocation.getOffset());
        long blockEnd = Math.min(start + length, blockLocation.getOffset() + blockLocation.getLength());
        if (blockStart > blockEnd) {
            // block is outside split range
            continue;
        }
        if (blockStart == blockEnd && !(blockStart == start && blockEnd == start + length)) {
            // skip zero-width block, except in the special circumstance: slice is empty, and the block covers the empty slice interval.
            continue;
        }
        blockBuilder.add(new InternalHiveBlock(blockStart, blockEnd, getHostAddresses(blockLocation)));
    }
    List<InternalHiveBlock> blocks = blockBuilder.build();
    checkBlocks(path, blocks, start, length);
    if (!splittable) {
        // not splittable: use the hosts from the first block, if it exists
        blocks = ImmutableList.of(new InternalHiveBlock(start, start + length, blocks.get(0).getAddresses()));
    }
    int bucketNumberIndex = bucketNumber.orElse(0);
    return Optional.of(new InternalHiveSplit(
            partitionName,
            pathString,
            start,
            start + length,
            estimatedFileSize,
            fileModificationTime,
            schema,
            partitionKeys,
            blocks,
            bucketNumber,
            () -> bucketStatementCounters.computeIfAbsent(bucketNumberIndex, index -> new AtomicInteger()).getAndIncrement(),
            splittable,
            forceLocalScheduling && allBlocksHaveAddress(blocks),
            tableToPartitionMapping,
            bucketConversion,
            bucketValidation,
            s3SelectPushdownEnabled && S3SelectPushdown.isCompressionCodecSupported(inputFormat, path),
            acidInfo,
            partitionMatchSupplier));
}
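The block-clamping loop above reduces to a small piece of interval arithmetic: intersect each file block with the split range [start, start + length), drop blocks that fall outside it, and drop zero-width blocks unless the split itself is empty. A minimal sketch follows; clampBlock is a hypothetical helper written for illustration (not part of Trino's API):

// A minimal, self-contained sketch of the block-clamping rule above.
// clampBlock is a hypothetical helper, not part of Trino; it returns the
// clamped [blockStart, blockEnd] range, or null when the block is skipped.
final class BlockClampingSketch {
    static long[] clampBlock(long start, long length, long blockOffset, long blockLength) {
        long blockStart = Math.max(start, blockOffset);
        long blockEnd = Math.min(start + length, blockOffset + blockLength);
        if (blockStart > blockEnd) {
            // block lies entirely outside the split range
            return null;
        }
        if (blockStart == blockEnd && !(blockStart == start && blockEnd == start + length)) {
            // zero-width block that is not the empty-split special case
            return null;
        }
        return new long[] {blockStart, blockEnd};
    }

    public static void main(String[] args) {
        // split covers bytes [100, 300); a block covering [0, 150) is clamped to [100, 150)
        long[] clamped = clampBlock(100, 200, 0, 150);
        System.out.println(clamped[0] + ".." + clamped[1]); // prints 100..150
        // a block covering [400, 500) lies outside the split and is dropped
        System.out.println(clampBlock(100, 200, 400, 100)); // prints null
    }
}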