use of com.facebook.presto.hive.util.InternalHiveSplitFactory in project presto by prestodb.
In class ManifestPartitionLoader, method createInternalHiveSplitFactory:
private InternalHiveSplitFactory createInternalHiveSplitFactory(
        Table table,
        HivePartitionMetadata partition,
        ConnectorSession session,
        Optional<Domain> pathDomain,
        HdfsEnvironment hdfsEnvironment,
        HdfsContext hdfsContext,
        boolean schedulerUsesHostAddresses)
        throws IOException
{
    String partitionName = partition.getHivePartition().getPartitionId();
    Storage storage = partition.getPartition().map(Partition::getStorage).orElse(table.getStorage());
    String inputFormatName = storage.getStorageFormat().getInputFormat();
    int partitionDataColumnCount = partition.getPartition()
            .map(p -> p.getColumns().size())
            .orElse(table.getDataColumns().size());
    List<HivePartitionKey> partitionKeys = getPartitionKeys(table, partition.getPartition(), partitionName);
    Path path = new Path(getPartitionLocation(table, partition.getPartition()));
    Configuration configuration = hdfsEnvironment.getConfiguration(hdfsContext, path);
    InputFormat<?, ?> inputFormat = getInputFormat(configuration, inputFormatName, false);
    ExtendedFileSystem fileSystem = hdfsEnvironment.getFileSystem(hdfsContext, path);

    return new InternalHiveSplitFactory(
            fileSystem,
            inputFormat,
            pathDomain,
            getNodeSelectionStrategy(session),
            getMaxInitialSplitSize(session),
            false,
            new HiveSplitPartitionInfo(
                    storage,
                    path.toUri(),
                    partitionKeys,
                    partitionName,
                    partitionDataColumnCount,
                    partition.getTableToPartitionMapping(),
                    Optional.empty(),
                    partition.getRedundantColumnDomains()),
            schedulerUsesHostAddresses,
            partition.getEncryptionInformation());
}
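Note: the factory returned here is applied file by file. A condensed usage sketch (listedFiles is a hypothetical stand-in for the caller's file listing, not part of the snippet above; the real call site appears in ManifestPartitionLoader.loadPartition at the end of this page):

InternalHiveSplitFactory splitFactory = createInternalHiveSplitFactory(table, partition, session, pathDomain, hdfsEnvironment, hdfsContext, schedulerUsesHostAddresses);
// "listedFiles" is an assumed List<HiveFileInfo>; each file yields at most one split
List<InternalHiveSplit> splits = listedFiles.stream()
        .map(fileInfo -> splitFactory.createInternalHiveSplit(fileInfo, true))
        .filter(Optional::isPresent)
        .map(Optional::get)
        .collect(toImmutableList());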
use of com.facebook.presto.hive.util.InternalHiveSplitFactory in project presto by prestodb.
In class StoragePartitionLoader, method getBucketedSplits:
private List<InternalHiveSplit> getBucketedSplits(
        Path path,
        ExtendedFileSystem fileSystem,
        InternalHiveSplitFactory splitFactory,
        BucketSplitInfo bucketSplitInfo,
        Optional<HiveSplit.BucketConversion> bucketConversion,
        String partitionName,
        boolean splittable,
        PathFilter pathFilter)
{
    int readBucketCount = bucketSplitInfo.getReadBucketCount();
    int tableBucketCount = bucketSplitInfo.getTableBucketCount();
    int partitionBucketCount = bucketConversion.map(HiveSplit.BucketConversion::getPartitionBucketCount).orElse(tableBucketCount);
    int bucketCount = max(readBucketCount, partitionBucketCount);
    checkState(readBucketCount <= tableBucketCount, "readBucketCount(%s) should be less than or equal to tableBucketCount(%s)", readBucketCount, tableBucketCount);

    // list all files in the partition
    List<HiveFileInfo> fileInfos = new ArrayList<>(partitionBucketCount);
    try {
        Iterators.addAll(fileInfos, directoryLister.list(fileSystem, table, path, namenodeStats, pathFilter, new HiveDirectoryContext(FAIL, isUseListDirectoryCache(session))));
    }
    catch (HiveFileIterator.NestedDirectoryNotAllowedException e) {
        // Fail here to be on the safe side. This seems to be the same as what Hive does
        throw new PrestoException(HIVE_INVALID_BUCKET_FILES, format("Hive table '%s' is corrupt. Found sub-directory in bucket directory for partition: %s", table.getSchemaTableName(), partitionName));
    }
    ListMultimap<Integer, HiveFileInfo> bucketToFileInfo = ArrayListMultimap.create();
    if (!shouldCreateFilesForMissingBuckets(table, session)) {
        fileInfos.stream().forEach(fileInfo -> {
            String fileName = fileInfo.getPath().getName();
            OptionalInt bucket = getBucketNumber(fileName);
            if (bucket.isPresent()) {
                bucketToFileInfo.put(bucket.getAsInt(), fileInfo);
            }
            else {
                throw new PrestoException(HIVE_INVALID_BUCKET_FILES, format("invalid hive bucket file name: %s", fileName));
            }
        });
    }
    else {
        // build mapping of file name to bucket
        for (HiveFileInfo file : fileInfos) {
            String fileName = file.getPath().getName();
            OptionalInt bucket = getBucketNumber(fileName);
            if (bucket.isPresent()) {
                bucketToFileInfo.put(bucket.getAsInt(), file);
                continue;
            }
            // legacy mode requires exactly one file per bucket
            if (fileInfos.size() != partitionBucketCount) {
                throw new PrestoException(HIVE_INVALID_BUCKET_FILES, format(
                        "Hive table '%s' is corrupt. File '%s' does not match the standard naming pattern, and the number " +
                        "of files in the directory (%s) does not match the declared bucket count (%s) for partition: %s",
                        table.getSchemaTableName(), fileName, fileInfos.size(), partitionBucketCount, partitionName));
            }
            if (fileInfos.get(0).getPath().getName().matches("\\d+")) {
                try {
                    // File names are integers if they were created with file_renaming_enabled set to true
                    fileInfos.sort(Comparator.comparingInt(fileInfo -> Integer.parseInt(fileInfo.getPath().getName())));
                }
                catch (NumberFormatException e) {
                    throw new PrestoException(HIVE_INVALID_FILE_NAMES, format("Hive table '%s' is corrupt. Some of the filenames in the partition: %s are not integers", new SchemaTableName(table.getDatabaseName(), table.getTableName()), partitionName));
                }
            }
            else {
                // Sort FileStatus objects (instead of, e.g., fileStatus.getPath().toString). This matches org.apache.hadoop.hive.ql.metadata.Table.getSortedPaths
                fileInfos.sort(null);
            }
            // Use position in sorted list as the bucket number
            bucketToFileInfo.clear();
            for (int i = 0; i < fileInfos.size(); i++) {
                bucketToFileInfo.put(i, fileInfos.get(i));
            }
            break;
        }
    }
    // convert files to internal splits
    List<InternalHiveSplit> splitList = new ArrayList<>();
    for (int bucketNumber = 0; bucketNumber < bucketCount; bucketNumber++) {
        // Physical bucket #. This determines the file name. It also determines the order of splits in the result.
        int partitionBucketNumber = bucketNumber % partitionBucketCount;
        if (!bucketToFileInfo.containsKey(partitionBucketNumber)) {
            continue;
        }
        // Logical bucket #. Each logical bucket corresponds to a "bucket" from the engine's perspective.
        int readBucketNumber = bucketNumber % readBucketCount;
        boolean containsIneligibleTableBucket = false;
        List<Integer> eligibleTableBucketNumbers = new ArrayList<>();
        for (int tableBucketNumber = bucketNumber % tableBucketCount; tableBucketNumber < tableBucketCount; tableBucketNumber += bucketCount) {
            // table bucket number: this is used for evaluating "$bucket" filters.
            if (bucketSplitInfo.isTableBucketEnabled(tableBucketNumber)) {
                eligibleTableBucketNumbers.add(tableBucketNumber);
            }
            else {
                containsIneligibleTableBucket = true;
            }
        }
        if (!eligibleTableBucketNumbers.isEmpty() && containsIneligibleTableBucket) {
            throw new PrestoException(NOT_SUPPORTED,
                    "The bucket filter cannot be satisfied. There are restrictions on the bucket filter when all the following is true: " +
                    "1. a table has a different buckets count as at least one of its partitions that is read in this query; " +
                    "2. the table has a different but compatible bucket number with another table in the query; " +
                    "3. some buckets of the table is filtered out from the query, most likely using a filter on \"$bucket\". " +
                    "(table name: " + table.getTableName() + ", table bucket count: " + tableBucketCount + ", " +
                    "partition bucket count: " + partitionBucketCount + ", effective reading bucket count: " + readBucketCount + ")");
        }
        if (!eligibleTableBucketNumbers.isEmpty()) {
            for (HiveFileInfo fileInfo : bucketToFileInfo.get(partitionBucketNumber)) {
                eligibleTableBucketNumbers.stream()
                        .map(tableBucketNumber -> splitFactory.createInternalHiveSplit(fileInfo, readBucketNumber, tableBucketNumber, splittable))
                        .forEach(optionalSplit -> optionalSplit.ifPresent(splitList::add));
            }
        }
    }
    return splitList;
}
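Note: the method above leans on getBucketNumber, which is not shown. A minimal sketch of such a parser, assuming the standard Hive bucket-file naming scheme (e.g. "000000_0"); Presto's actual implementation handles more naming variants:

import java.util.OptionalInt;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

final class BucketFileNames
{
    // Standard Hive bucket files begin with a zero-padded bucket id, e.g. "000000_0" or "000001_0_copy_1".
    private static final Pattern BUCKET_FILE = Pattern.compile("(\\d+)_\\d+.*");

    private BucketFileNames() {}

    static OptionalInt getBucketNumber(String fileName)
    {
        Matcher matcher = BUCKET_FILE.matcher(fileName);
        if (matcher.matches()) {
            return OptionalInt.of(Integer.parseInt(matcher.group(1)));
        }
        return OptionalInt.empty();
    }
}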
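Note: the interaction of readBucketCount, tableBucketCount, and partitionBucketCount is easiest to see with concrete numbers. A self-contained demo of the index arithmetic above (the counts are illustrative assumptions, not values from the snippet):

public final class BucketArithmeticDemo
{
    public static void main(String[] args)
    {
        int tableBucketCount = 8;     // buckets declared on the table
        int partitionBucketCount = 4; // buckets this partition was written with
        int readBucketCount = 4;      // logical buckets the engine reads
        int bucketCount = Math.max(readBucketCount, partitionBucketCount); // 4

        for (int bucketNumber = 0; bucketNumber < bucketCount; bucketNumber++) {
            int partitionBucketNumber = bucketNumber % partitionBucketCount; // which file(s) to read
            int readBucketNumber = bucketNumber % readBucketCount;           // logical bucket they feed
            StringBuilder tableBuckets = new StringBuilder();                // targets of "$bucket" filters
            for (int t = bucketNumber % tableBucketCount; t < tableBucketCount; t += bucketCount) {
                tableBuckets.append(t).append(' ');
            }
            System.out.printf("partition bucket %d -> logical bucket %d -> table buckets: %s%n", partitionBucketNumber, readBucketNumber, tableBuckets.toString().trim());
        }
    }
}

With these numbers, each partition bucket serves two table buckets (for example, partition bucket 1 covers table buckets 1 and 5), which is why the eligibility loop above must accept either all or none of a file's table buckets.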
use of com.facebook.presto.hive.util.InternalHiveSplitFactory in project presto by prestodb.
In class StoragePartitionLoader, method loadPartition:
@Override
public ListenableFuture<?> loadPartition(HivePartitionMetadata partition, HiveSplitSource hiveSplitSource, boolean stopped)
        throws IOException
{
    String partitionName = partition.getHivePartition().getPartitionId();
    Storage storage = partition.getPartition().map(Partition::getStorage).orElse(table.getStorage());
    Properties schema = getPartitionSchema(table, partition.getPartition());
    String inputFormatName = storage.getStorageFormat().getInputFormat();
    int partitionDataColumnCount = partition.getPartition()
            .map(p -> p.getColumns().size())
            .orElse(table.getDataColumns().size());
    List<HivePartitionKey> partitionKeys = getPartitionKeys(table, partition.getPartition(), partitionName);
    String location = getPartitionLocation(table, partition.getPartition());
    if (location.isEmpty()) {
        checkState(!shouldCreateFilesForMissingBuckets(table, session), "Empty location is only allowed for empty temporary table when zero-row file is not created");
        return COMPLETED_FUTURE;
    }
    Path path = new Path(location);
    Configuration configuration = hdfsEnvironment.getConfiguration(hdfsContext, path);
    InputFormat<?, ?> inputFormat = getInputFormat(configuration, inputFormatName, false);
    ExtendedFileSystem fs = hdfsEnvironment.getFileSystem(hdfsContext, path);
    boolean s3SelectPushdownEnabled = shouldEnablePushdownForTable(session, table, path.toString(), partition.getPartition());
    if (inputFormat instanceof SymlinkTextInputFormat) {
        if (tableBucketInfo.isPresent()) {
            throw new PrestoException(NOT_SUPPORTED, "Bucketed table in SymlinkTextInputFormat is not yet supported");
        }
        // TODO: This should use an iterator like the HiveFileIterator
        ListenableFuture<?> lastResult = COMPLETED_FUTURE;
        for (Path targetPath : getTargetPathsFromSymlink(fs, path)) {
            // The input should be in TextInputFormat.
            TextInputFormat targetInputFormat = new TextInputFormat();
            // the splits must be generated using the file system for the target path
            // get the configuration for the target path -- it may be a different hdfs instance
            ExtendedFileSystem targetFilesystem = hdfsEnvironment.getFileSystem(hdfsContext, targetPath);
            JobConf targetJob = toJobConf(targetFilesystem.getConf());
            targetJob.setInputFormat(TextInputFormat.class);
            targetInputFormat.configure(targetJob);
            FileInputFormat.setInputPaths(targetJob, targetPath);
            InputSplit[] targetSplits = targetInputFormat.getSplits(targetJob, 0);
            InternalHiveSplitFactory splitFactory = new InternalHiveSplitFactory(
                    targetFilesystem,
                    inputFormat,
                    pathDomain,
                    getNodeSelectionStrategy(session),
                    getMaxInitialSplitSize(session),
                    s3SelectPushdownEnabled,
                    new HiveSplitPartitionInfo(
                            storage,
                            path.toUri(),
                            partitionKeys,
                            partitionName,
                            partitionDataColumnCount,
                            partition.getTableToPartitionMapping(),
                            Optional.empty(),
                            partition.getRedundantColumnDomains()),
                    schedulerUsesHostAddresses,
                    partition.getEncryptionInformation());
            lastResult = addSplitsToSource(targetSplits, splitFactory, hiveSplitSource, stopped);
            if (stopped) {
                return COMPLETED_FUTURE;
            }
        }
        return lastResult;
    }
    Optional<HiveSplit.BucketConversion> bucketConversion = Optional.empty();
    boolean bucketConversionRequiresWorkerParticipation = false;
    if (partition.getPartition().isPresent()) {
        Optional<HiveBucketProperty> partitionBucketProperty = partition.getPartition().get().getStorage().getBucketProperty();
        if (tableBucketInfo.isPresent() && partitionBucketProperty.isPresent()) {
            int tableBucketCount = tableBucketInfo.get().getTableBucketCount();
            int partitionBucketCount = partitionBucketProperty.get().getBucketCount();
            // Here, it is just checking whether a BucketConversion is needed.
            if (tableBucketCount != partitionBucketCount) {
                bucketConversion = Optional.of(new HiveSplit.BucketConversion(tableBucketCount, partitionBucketCount, tableBucketInfo.get().getBucketColumns()));
                if (tableBucketCount > partitionBucketCount) {
                    bucketConversionRequiresWorkerParticipation = true;
                }
            }
        }
    }
    InternalHiveSplitFactory splitFactory = new InternalHiveSplitFactory(
            fs,
            inputFormat,
            pathDomain,
            getNodeSelectionStrategy(session),
            getMaxInitialSplitSize(session),
            s3SelectPushdownEnabled,
            new HiveSplitPartitionInfo(
                    storage,
                    path.toUri(),
                    partitionKeys,
                    partitionName,
                    partitionDataColumnCount,
                    partition.getTableToPartitionMapping(),
                    bucketConversionRequiresWorkerParticipation ? bucketConversion : Optional.empty(),
                    partition.getRedundantColumnDomains()),
            schedulerUsesHostAddresses,
            partition.getEncryptionInformation());

    if (shouldUseFileSplitsFromInputFormat(inputFormat, configuration, table.getStorage().getLocation())) {
        if (tableBucketInfo.isPresent()) {
            throw new PrestoException(NOT_SUPPORTED, "Presto cannot read bucketed partition in an input format with UseFileSplitsFromInputFormat annotation: " + inputFormat.getClass().getSimpleName());
        }
        JobConf jobConf = toJobConf(configuration);
        FileInputFormat.setInputPaths(jobConf, path);
        // pass SerDe parameters and table parameters to the input format
        fromProperties(schema).forEach(jobConf::set);
        InputSplit[] splits = inputFormat.getSplits(jobConf, 0);
        return addSplitsToSource(splits, splitFactory, hiveSplitSource, stopped);
    }

    PathFilter pathFilter = isHudiParquetInputFormat(inputFormat) ? hoodiePathFilterLoadingCache.getUnchecked(configuration) : path1 -> true;

    // Streaming aggregation, S3 Select pushdown, and partial aggregation pushdown all work at the
    // granularity of individual files (or S3 objects), so files must not be split when any of them
    // is enabled. Files with skipped header/footer lines are not splittable either, except for the
    // special case skip.header.line.count=1.
    boolean splittable = isFileSplittable(session) &&
            !isStreamingAggregationEnabled(session) &&
            !s3SelectPushdownEnabled &&
            !partialAggregationsPushedDown &&
            getFooterCount(schema) == 0 &&
            getHeaderCount(schema) <= 1;

    // Bucketed partitions are fully loaded immediately since all files must be loaded to determine the file-to-bucket mapping
    if (tableBucketInfo.isPresent()) {
        if (tableBucketInfo.get().isVirtuallyBucketed()) {
            // For a virtually bucketed table, bucket conversion must not be present because there is no physical partition bucket count
            checkState(!bucketConversion.isPresent(), "Virtually bucketed table must not have partitions that are physically bucketed");
            checkState(tableBucketInfo.get().getTableBucketCount() == tableBucketInfo.get().getReadBucketCount(), "Table and read bucket count should be the same for virtual bucket");
            return hiveSplitSource.addToQueue(getVirtuallyBucketedSplits(path, fs, splitFactory, tableBucketInfo.get().getReadBucketCount(), splittable, pathFilter));
        }
        return hiveSplitSource.addToQueue(getBucketedSplits(path, fs, splitFactory, tableBucketInfo.get(), bucketConversion, partitionName, splittable, pathFilter));
    }

    fileIterators.addLast(createInternalHiveSplitIterator(path, fs, splitFactory, splittable, pathFilter, partition.getPartition()));
    return COMPLETED_FUTURE;
}
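Note: getTargetPathsFromSymlink is elided above. A minimal sketch of what such a helper could look like, assuming the usual SymlinkTextInputFormat layout in which every manifest file under the symlink directory lists one target path per line (a hypothetical helper, not Presto's actual implementation):

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

final class SymlinkTargets
{
    private SymlinkTargets() {}

    static List<Path> getTargetPathsFromSymlink(FileSystem fileSystem, Path symlinkDir)
            throws IOException
    {
        List<Path> targets = new ArrayList<>();
        for (FileStatus manifest : fileSystem.listStatus(symlinkDir)) {
            // each symlink manifest file holds one target path per line
            try (BufferedReader reader = new BufferedReader(new InputStreamReader(fileSystem.open(manifest.getPath()), StandardCharsets.UTF_8))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    if (!line.isEmpty()) {
                        targets.add(new Path(line));
                    }
                }
            }
        }
        return targets;
    }
}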
use of com.facebook.presto.hive.util.InternalHiveSplitFactory in project presto by prestodb.
In class ManifestPartitionLoader, method loadPartition:
public ListenableFuture<?> loadPartition(HivePartitionMetadata partition, HiveSplitSource hiveSplitSource, boolean stopped)
        throws IOException
{
    Path path = new Path(getPartitionLocation(table, partition.getPartition()));
    Map<String, String> parameters = partition.getPartition().get().getParameters();

    // TODO: Add support for more manifest versions
    // Verify the manifest version
    verify(VERSION_1.equals(parameters.get(MANIFEST_VERSION)), "Manifest version is not equal to %s", VERSION_1);

    List<String> fileNames = decompressFileNames(parameters.get(FILE_NAMES));
    List<Long> fileSizes = decompressFileSizes(parameters.get(FILE_SIZES));

    // Verify that fileNames and fileSizes have the same length
    verify(fileNames.size() == fileSizes.size(), "List of fileNames and fileSizes differ in length");

    if (isManifestVerificationEnabled(session)) {
        // Verify that the file names and sizes in the manifest match those listed by the directory lister
        validateManifest(partition, path, fileNames, fileSizes);
    }

    ImmutableList.Builder<HiveFileInfo> fileListBuilder = ImmutableList.builder();
    for (int i = 0; i < fileNames.size(); i++) {
        Path filePath = new Path(path, fileNames.get(i));
        FileStatus fileStatus = new FileStatus(fileSizes.get(i), false, 1, getMaxSplitSize(session).toBytes(), 0, filePath);
        try {
            BlockLocation[] locations = new BlockLocation[] {new BlockLocation(BLOCK_LOCATION_NAMES, BLOCK_LOCATION_HOSTS, 0, fileSizes.get(i))};
            // It is safe to set extraFileContext to empty because downstream code always checks whether it is present before proceeding.
            fileListBuilder.add(HiveFileInfo.createHiveFileInfo(new LocatedFileStatus(fileStatus, locations), Optional.empty()));
        }
        catch (IOException e) {
            throw new UncheckedIOException(e);
        }
    }

    InternalHiveSplitFactory splitFactory = createInternalHiveSplitFactory(table, partition, session, pathDomain, hdfsEnvironment, hdfsContext, schedulerUsesHostAddresses);
    return hiveSplitSource.addToQueue(fileListBuilder.build().stream()
            .map(status -> splitFactory.createInternalHiveSplit(status, true))
            .filter(Optional::isPresent)
            .map(Optional::get)
            .collect(toImmutableList()));
}
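Note: validateManifest is elided above. A minimal sketch of the check it implies, assuming the caller can obtain the directory listing as a name-to-size map (names and signature here are hypothetical, not Presto's actual implementation):

import java.util.List;
import java.util.Map;

final class ManifestValidation
{
    private ManifestValidation() {}

    // Hypothetical check: every manifest entry must match a listed file's name and size.
    static void validateManifest(Map<String, Long> listedFileSizesByName, List<String> fileNames, List<Long> fileSizes)
    {
        if (listedFileSizesByName.size() != fileNames.size()) {
            throw new IllegalStateException("Manifest file count does not match the directory listing");
        }
        for (int i = 0; i < fileNames.size(); i++) {
            Long listedSize = listedFileSizesByName.get(fileNames.get(i));
            if (listedSize == null || listedSize != fileSizes.get(i).longValue()) {
                throw new IllegalStateException("Manifest entry does not match the directory listing: " + fileNames.get(i));
            }
        }
    }
}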