Use of com.facebook.presto.hive.util.HiveFileIterator in the project presto by prestodb.
The method loadPartition of the class BackgroundHiveSplitLoader.
/**
 * Loads the splits of a single partition.
 *
 * <p>Depending on the partition's input format and on bucketing, the splits are either
 * handed over immediately (via {@code addSplitsToSource} or
 * {@code addToHiveSplitSourceRoundRobin}), or a {@link HiveFileIterator} over the partition
 * directory is appended to {@code fileIterators} so its files can be consumed later.
 *
 * @param partition metadata of the partition to load
 * @throws IOException if one of the file system or split operations fails
 * @throws PrestoException with {@code NOT_SUPPORTED} when the input format is
 *         {@code SymlinkTextInputFormat} and a bucket handle is present
 */
private void loadPartition(HivePartitionMetadata partition) throws IOException {
    String partitionName = partition.getHivePartition().getPartitionId();
    Properties schema = getPartitionSchema(table, partition.getPartition());
    List<HivePartitionKey> partitionKeys = getPartitionKeys(table, partition.getPartition());
    TupleDomain<HiveColumnHandle> effectivePredicate = partition.getHivePartition().getEffectivePredicate();

    Path partitionPath = new Path(getPartitionLocation(table, partition.getPartition()));
    Configuration partitionConf = hdfsEnvironment.getConfiguration(partitionPath);
    InputFormat<?, ?> inputFormat = getInputFormat(partitionConf, schema, false);
    FileSystem fileSystem = hdfsEnvironment.getFileSystem(session.getUser(), partitionPath);

    // Case 1: the partition location holds symlinks that point at the real text files.
    if (inputFormat instanceof SymlinkTextInputFormat) {
        if (bucketHandle.isPresent()) {
            throw new PrestoException(StandardErrorCode.NOT_SUPPORTED, "Bucketed table in SymlinkTextInputFormat is not yet supported");
        }

        // TODO: This should use an iterator like the HiveFileIterator
        for (Path symlinkTarget : getTargetPathsFromSymlink(fileSystem, partitionPath)) {
            // The target may live on a different hdfs instance, so look up its own configuration.
            JobConf symlinkJob = new JobConf(hdfsEnvironment.getConfiguration(symlinkTarget));
            symlinkJob.setInputFormat(TextInputFormat.class);

            // Symlink targets are read as plain TextInputFormat.
            TextInputFormat textFormat = new TextInputFormat();
            textFormat.configure(symlinkJob);
            FileInputFormat.setInputPaths(symlinkJob, symlinkTarget);

            InputSplit[] symlinkSplits = textFormat.getSplits(symlinkJob, 0);
            // A true result from addSplitsToSource ends the loading of this partition early.
            if (addSplitsToSource(
                    symlinkSplits,
                    partitionName,
                    partitionKeys,
                    schema,
                    effectivePredicate,
                    partition.getColumnCoercions())) {
                return;
            }
        }
        return;
    }

    // Case 2: input formats for which shouldUseFileSplitsFromInputFormat() holds compute
    // their own splits, so call getSplits() on the input format to obtain the file splits.
    if (shouldUseFileSplitsFromInputFormat(inputFormat)) {
        JobConf formatJob = new JobConf(partitionConf);
        FileInputFormat.setInputPaths(formatJob, partitionPath);
        InputSplit[] formatSplits = inputFormat.getSplits(formatJob, 0);
        addSplitsToSource(
                formatSplits,
                partitionName,
                partitionKeys,
                schema,
                effectivePredicate,
                partition.getColumnCoercions());
        return;
    }

    HiveFileIterator iterator = new HiveFileIterator(
            partitionPath,
            fileSystem,
            directoryLister,
            namenodeStats,
            partitionName,
            inputFormat,
            schema,
            partitionKeys,
            effectivePredicate,
            partition.getColumnCoercions());

    // Case 3: specific buckets were selected -- load only the files of those buckets.
    if (!buckets.isEmpty()) {
        int totalBuckets = buckets.get(0).getBucketCount();
        // The sorted listing is indexed by bucket number below.
        List<LocatedFileStatus> bucketFiles = listAndSortBucketFiles(iterator, totalBuckets);
        List<Iterator<HiveSplit>> splitIterators = new ArrayList<>();
        for (HiveBucket selected : buckets) {
            int bucketNumber = selected.getBucketNumber();
            LocatedFileStatus bucketFile = bucketFiles.get(bucketNumber);
            Path bucketPath = bucketFile.getPath();
            boolean splittable = isSplittable(
                    iterator.getInputFormat(),
                    hdfsEnvironment.getFileSystem(session.getUser(), bucketPath),
                    bucketPath);
            splitIterators.add(createHiveSplitIterator(
                    iterator.getPartitionName(),
                    bucketPath.toString(),
                    bucketFile.getBlockLocations(),
                    0,
                    bucketFile.getLen(),
                    iterator.getSchema(),
                    iterator.getPartitionKeys(),
                    splittable,
                    session,
                    OptionalInt.of(bucketNumber),
                    effectivePredicate,
                    partition.getColumnCoercions()));
        }
        addToHiveSplitSourceRoundRobin(splitIterators);
        return;
    }

    // Case 4: the table is bucketed -- list the directory, sort, and tag every file with its bucket id.
    if (bucketHandle.isPresent()) {
        // HiveFileIterator skips hidden files automatically.
        int totalBuckets = bucketHandle.get().getBucketCount();
        List<LocatedFileStatus> bucketFiles = listAndSortBucketFiles(iterator, totalBuckets);
        List<Iterator<HiveSplit>> splitIterators = new ArrayList<>();
        for (int bucketId = 0; bucketId < totalBuckets; bucketId++) {
            LocatedFileStatus bucketFile = bucketFiles.get(bucketId);
            Path bucketPath = bucketFile.getPath();
            boolean splittable = isSplittable(
                    iterator.getInputFormat(),
                    hdfsEnvironment.getFileSystem(session.getUser(), bucketPath),
                    bucketPath);
            splitIterators.add(createHiveSplitIterator(
                    iterator.getPartitionName(),
                    bucketPath.toString(),
                    bucketFile.getBlockLocations(),
                    0,
                    bucketFile.getLen(),
                    iterator.getSchema(),
                    iterator.getPartitionKeys(),
                    splittable,
                    session,
                    OptionalInt.of(bucketId),
                    iterator.getEffectivePredicate(),
                    partition.getColumnCoercions()));
        }
        addToHiveSplitSourceRoundRobin(splitIterators);
        return;
    }

    // Case 5: plain partition -- queue the iterator; its files are turned into splits later.
    fileIterators.addLast(iterator);
}
Use of com.facebook.presto.hive.util.HiveFileIterator in the project presto by prestodb.
The method loadSplits of the class BackgroundHiveSplitLoader.
/**
 * Performs one unit of split loading.
 *
 * <p>If a file iterator is queued, its entries are drained: directories are queued as new
 * iterators when {@code recursiveDirWalkerEnabled} is set (and skipped otherwise), and every
 * other entry is converted into a split iterator that is added to {@code hiveSplitSource}.
 * If no file iterator is queued, the next pending partition (if any) is loaded instead.
 *
 * @return the future returned by {@code hiveSplitSource.addToQueue} when it is not yet done
 *         (the current iterator is put back at the head of the queue first); otherwise
 *         {@code COMPLETED_FUTURE}
 * @throws IOException if listing files or loading a partition fails
 */
private CompletableFuture<?> loadSplits() throws IOException {
    HiveFileIterator current = fileIterators.poll();

    // Nothing to list right now: move on to the next partition, if one is left.
    if (current == null) {
        HivePartitionMetadata nextPartition = partitions.poll();
        if (nextPartition != null) {
            loadPartition(nextPartition);
        }
        return COMPLETED_FUTURE;
    }

    while (current.hasNext() && !stopped) {
        LocatedFileStatus entry = current.next();

        if (isDirectory(entry)) {
            if (recursiveDirWalkerEnabled) {
                // Queue the sub-directory, carrying over every setting of the parent iterator.
                fileIterators.add(new HiveFileIterator(
                        entry.getPath(),
                        current.getFileSystem(),
                        current.getDirectoryLister(),
                        current.getNamenodeStats(),
                        current.getPartitionName(),
                        current.getInputFormat(),
                        current.getSchema(),
                        current.getPartitionKeys(),
                        current.getEffectivePredicate(),
                        current.getColumnCoercions()));
            }
            continue;
        }

        boolean splittable = isSplittable(
                current.getInputFormat(),
                hdfsEnvironment.getFileSystem(session.getUser(), entry.getPath()),
                entry.getPath());
        CompletableFuture<?> queued = hiveSplitSource.addToQueue(createHiveSplitIterator(
                current.getPartitionName(),
                entry.getPath().toString(),
                entry.getBlockLocations(),
                0,
                entry.getLen(),
                current.getSchema(),
                current.getPartitionKeys(),
                splittable,
                session,
                OptionalInt.empty(),
                current.getEffectivePredicate(),
                current.getColumnCoercions()));

        if (!queued.isDone()) {
            // Put the iterator back at the front so the remaining files are picked up next time.
            fileIterators.addFirst(current);
            return queued;
        }
    }

    // The iterator is not re-queued here: it is either exhausted or loading was stopped.
    return COMPLETED_FUTURE;
}
Use of com.facebook.presto.hive.util.HiveFileIterator in the project presto by prestodb.
The method loadPartition of the class StoragePartitionLoader.
/**
 * Loads the splits of a single partition into {@code hiveSplitSource}.
 *
 * <p>The partition is handled in one of these ways, checked in this order:
 * <ol>
 * <li>an empty location yields no splits;</li>
 * <li>{@code SymlinkTextInputFormat}: every symlink target is split as {@code TextInputFormat};</li>
 * <li>input formats accepted by {@code shouldUseFileSplitsFromInputFormat}: the input format's
 *     own {@code getSplits} is used;</li>
 * <li>no table bucket info: an internal split iterator is appended to {@code fileIterators};</li>
 * <li>table bucket info present: virtually bucketed or bucketed splits are added to the queue.</li>
 * </ol>
 *
 * @param partition metadata of the partition to load
 * @param hiveSplitSource destination of the generated splits
 * @param stopped stop flag as seen by the caller; it is forwarded to {@code addSplitsToSource}
 *        and checked after each symlink target
 * @return the future of the last enqueue operation, or {@code COMPLETED_FUTURE} when nothing
 *         was enqueued by this call
 * @throws IOException if a file system or split operation fails
 * @throws PrestoException with {@code NOT_SUPPORTED} for bucketed tables whose input format is
 *         {@code SymlinkTextInputFormat} or is accepted by {@code shouldUseFileSplitsFromInputFormat}
 */
@Override
public ListenableFuture<?> loadPartition(HivePartitionMetadata partition, HiveSplitSource hiveSplitSource, boolean stopped) throws IOException {
    String partitionName = partition.getHivePartition().getPartitionId();
    Storage storage = partition.getPartition().map(Partition::getStorage).orElse(table.getStorage());
    Properties schema = getPartitionSchema(table, partition.getPartition());
    String inputFormatName = storage.getStorageFormat().getInputFormat();
    int partitionDataColumnCount = partition.getPartition()
            .map(hivePartition -> hivePartition.getColumns().size())
            .orElse(table.getDataColumns().size());
    List<HivePartitionKey> partitionKeys = getPartitionKeys(table, partition.getPartition(), partitionName);

    String location = getPartitionLocation(table, partition.getPartition());
    if (location.isEmpty()) {
        checkState(!shouldCreateFilesForMissingBuckets(table, session), "Empty location is only allowed for empty temporary table when zero-row file is not created");
        return COMPLETED_FUTURE;
    }

    Path path = new Path(location);
    Configuration configuration = hdfsEnvironment.getConfiguration(hdfsContext, path);
    InputFormat<?, ?> inputFormat = getInputFormat(configuration, inputFormatName, false);
    ExtendedFileSystem fs = hdfsEnvironment.getFileSystem(hdfsContext, path);
    boolean s3SelectPushdownEnabled = shouldEnablePushdownForTable(session, table, path.toString(), partition.getPartition());

    if (inputFormat instanceof SymlinkTextInputFormat) {
        if (tableBucketInfo.isPresent()) {
            throw new PrestoException(NOT_SUPPORTED, "Bucketed table in SymlinkTextInputFormat is not yet supported");
        }

        // TODO: This should use an iterator like the HiveFileIterator
        ListenableFuture<?> latestFuture = COMPLETED_FUTURE;
        for (Path symlinkTarget : getTargetPathsFromSymlink(fs, path)) {
            // Symlink targets are read as plain TextInputFormat.
            TextInputFormat textFormat = new TextInputFormat();

            // The target may be on a different hdfs instance, so both the job configuration
            // and the split factory are built from the target's own file system.
            ExtendedFileSystem targetFileSystem = hdfsEnvironment.getFileSystem(hdfsContext, symlinkTarget);
            JobConf symlinkJob = toJobConf(targetFileSystem.getConf());
            symlinkJob.setInputFormat(TextInputFormat.class);
            textFormat.configure(symlinkJob);
            FileInputFormat.setInputPaths(symlinkJob, symlinkTarget);
            InputSplit[] symlinkSplits = textFormat.getSplits(symlinkJob, 0);

            InternalHiveSplitFactory symlinkSplitFactory = new InternalHiveSplitFactory(
                    targetFileSystem,
                    inputFormat,
                    pathDomain,
                    getNodeSelectionStrategy(session),
                    getMaxInitialSplitSize(session),
                    s3SelectPushdownEnabled,
                    new HiveSplitPartitionInfo(
                            storage,
                            path.toUri(),
                            partitionKeys,
                            partitionName,
                            partitionDataColumnCount,
                            partition.getTableToPartitionMapping(),
                            Optional.empty(),
                            partition.getRedundantColumnDomains()),
                    schedulerUsesHostAddresses,
                    partition.getEncryptionInformation());

            latestFuture = addSplitsToSource(symlinkSplits, symlinkSplitFactory, hiveSplitSource, stopped);
            if (stopped) {
                return COMPLETED_FUTURE;
            }
        }
        return latestFuture;
    }

    // Work out whether the partition's bucket count differs from the table's, in which case
    // a BucketConversion describing both counts is needed.
    Optional<HiveSplit.BucketConversion> bucketConversion = Optional.empty();
    boolean workerMustConvert = false;
    if (partition.getPartition().isPresent()) {
        Optional<HiveBucketProperty> partitionBucketProperty = partition.getPartition().get().getStorage().getBucketProperty();
        if (tableBucketInfo.isPresent() && partitionBucketProperty.isPresent()) {
            int tableBucketCount = tableBucketInfo.get().getTableBucketCount();
            int partitionBucketCount = partitionBucketProperty.get().getBucketCount();
            if (tableBucketCount != partitionBucketCount) {
                bucketConversion = Optional.of(new HiveSplit.BucketConversion(
                        tableBucketCount,
                        partitionBucketCount,
                        tableBucketInfo.get().getBucketColumns()));
                // Only a table with more buckets than the partition is flagged as needing
                // the conversion inside the split partition info.
                workerMustConvert = tableBucketCount > partitionBucketCount;
            }
        }
    }
    Optional<HiveSplit.BucketConversion> conversionForSplits = workerMustConvert ? bucketConversion : Optional.empty();

    InternalHiveSplitFactory splitFactory = new InternalHiveSplitFactory(
            fs,
            inputFormat,
            pathDomain,
            getNodeSelectionStrategy(session),
            getMaxInitialSplitSize(session),
            s3SelectPushdownEnabled,
            new HiveSplitPartitionInfo(
                    storage,
                    path.toUri(),
                    partitionKeys,
                    partitionName,
                    partitionDataColumnCount,
                    partition.getTableToPartitionMapping(),
                    conversionForSplits,
                    partition.getRedundantColumnDomains()),
            schedulerUsesHostAddresses,
            partition.getEncryptionInformation());

    if (shouldUseFileSplitsFromInputFormat(inputFormat, configuration, table.getStorage().getLocation())) {
        if (tableBucketInfo.isPresent()) {
            throw new PrestoException(NOT_SUPPORTED, "Presto cannot read bucketed partition in an input format with UseFileSplitsFromInputFormat annotation: " + inputFormat.getClass().getSimpleName());
        }
        JobConf formatJob = toJobConf(configuration);
        FileInputFormat.setInputPaths(formatJob, path);
        // Pass the SerDe parameters and table parameters on to the input format.
        fromProperties(schema).forEach(formatJob::set);
        InputSplit[] formatSplits = inputFormat.getSplits(formatJob, 0);
        return addSplitsToSource(formatSplits, splitFactory, hiveSplitSource, stopped);
    }

    PathFilter pathFilter = isHudiParquetInputFormat(inputFormat)
            ? hoodiePathFilterLoadingCache.getUnchecked(configuration)
            : ignoredPath -> true;

    // Streaming aggregation and partial aggregation pushdown work at the granularity of
    // individual files, and S3 Select pushdown at the granularity of individual S3 objects,
    // so files must not be split when any of them is enabled.
    // Files with header / footer lines to skip are not splittable either, except for the
    // special case skip.header.line.count=1.
    boolean splittable = isFileSplittable(session)
            && !(isStreamingAggregationEnabled(session) || s3SelectPushdownEnabled || partialAggregationsPushedDown)
            && getFooterCount(schema) == 0
            && getHeaderCount(schema) <= 1;

    // Unbucketed: just queue an iterator for this partition's files.
    if (!tableBucketInfo.isPresent()) {
        fileIterators.addLast(createInternalHiveSplitIterator(path, fs, splitFactory, splittable, pathFilter, partition.getPartition()));
        return COMPLETED_FUTURE;
    }

    // Bucketed partitions are fully loaded immediately since all files must be loaded to
    // determine the file to bucket mapping.
    if (tableBucketInfo.get().isVirtuallyBucketed()) {
        // A virtual bucket has no physical partition bucket count, so no conversion may exist.
        checkState(!bucketConversion.isPresent(), "Virtually bucketed table must not have partitions that are physically bucketed");
        checkState(tableBucketInfo.get().getTableBucketCount() == tableBucketInfo.get().getReadBucketCount(), "Table and read bucket count should be the same for virtual bucket");
        return hiveSplitSource.addToQueue(getVirtuallyBucketedSplits(
                path,
                fs,
                splitFactory,
                tableBucketInfo.get().getReadBucketCount(),
                splittable,
                pathFilter));
    }
    return hiveSplitSource.addToQueue(getBucketedSplits(
            path,
            fs,
            splitFactory,
            tableBucketInfo.get(),
            bucketConversion,
            partitionName,
            splittable,
            pathFilter));
}
Aggregations