Example 1 with SymlinkTextInputFormat

Use of org.apache.hadoop.hive.ql.io.SymlinkTextInputFormat in project presto by prestodb.

From the class BackgroundHiveSplitLoader, the method loadPartition:

private void loadPartition(HivePartitionMetadata partition) throws IOException {
    String partitionName = partition.getHivePartition().getPartitionId();
    Properties schema = getPartitionSchema(table, partition.getPartition());
    List<HivePartitionKey> partitionKeys = getPartitionKeys(table, partition.getPartition());
    TupleDomain<HiveColumnHandle> effectivePredicate = partition.getHivePartition().getEffectivePredicate();
    Path path = new Path(getPartitionLocation(table, partition.getPartition()));
    Configuration configuration = hdfsEnvironment.getConfiguration(path);
    InputFormat<?, ?> inputFormat = getInputFormat(configuration, schema, false);
    FileSystem fs = hdfsEnvironment.getFileSystem(session.getUser(), path);
    if (inputFormat instanceof SymlinkTextInputFormat) {
        if (bucketHandle.isPresent()) {
            throw new PrestoException(StandardErrorCode.NOT_SUPPORTED, "Bucketed table in SymlinkTextInputFormat is not yet supported");
        }
        // TODO: This should use an iterator like the HiveFileIterator
        for (Path targetPath : getTargetPathsFromSymlink(fs, path)) {
            // The input should be in TextInputFormat.
            TextInputFormat targetInputFormat = new TextInputFormat();
            // get the configuration for the target path -- it may be a different hdfs instance
            Configuration targetConfiguration = hdfsEnvironment.getConfiguration(targetPath);
            JobConf targetJob = new JobConf(targetConfiguration);
            targetJob.setInputFormat(TextInputFormat.class);
            targetInputFormat.configure(targetJob);
            FileInputFormat.setInputPaths(targetJob, targetPath);
            InputSplit[] targetSplits = targetInputFormat.getSplits(targetJob, 0);
            if (addSplitsToSource(targetSplits, partitionName, partitionKeys, schema, effectivePredicate, partition.getColumnCoercions())) {
                return;
            }
        }
        return;
    }
    // To support custom input formats, we want to call getSplits()
    // on the input format to obtain file splits.
    if (shouldUseFileSplitsFromInputFormat(inputFormat)) {
        JobConf jobConf = new JobConf(configuration);
        FileInputFormat.setInputPaths(jobConf, path);
        InputSplit[] splits = inputFormat.getSplits(jobConf, 0);
        addSplitsToSource(splits, partitionName, partitionKeys, schema, effectivePredicate, partition.getColumnCoercions());
        return;
    }
    // If only one bucket could match: load that one file
    HiveFileIterator iterator = new HiveFileIterator(path, fs, directoryLister, namenodeStats, partitionName, inputFormat, schema, partitionKeys, effectivePredicate, partition.getColumnCoercions());
    if (!buckets.isEmpty()) {
        int bucketCount = buckets.get(0).getBucketCount();
        List<LocatedFileStatus> list = listAndSortBucketFiles(iterator, bucketCount);
        List<Iterator<HiveSplit>> iteratorList = new ArrayList<>();
        for (HiveBucket bucket : buckets) {
            int bucketNumber = bucket.getBucketNumber();
            LocatedFileStatus file = list.get(bucketNumber);
            boolean splittable = isSplittable(iterator.getInputFormat(), hdfsEnvironment.getFileSystem(session.getUser(), file.getPath()), file.getPath());
            iteratorList.add(createHiveSplitIterator(iterator.getPartitionName(), file.getPath().toString(), file.getBlockLocations(), 0, file.getLen(), iterator.getSchema(), iterator.getPartitionKeys(), splittable, session, OptionalInt.of(bucketNumber), effectivePredicate, partition.getColumnCoercions()));
        }
        addToHiveSplitSourceRoundRobin(iteratorList);
        return;
    }
    // If table is bucketed: list the directory, sort, tag with bucket id
    if (bucketHandle.isPresent()) {
        // HiveFileIterator skips hidden files automatically.
        int bucketCount = bucketHandle.get().getBucketCount();
        List<LocatedFileStatus> list = listAndSortBucketFiles(iterator, bucketCount);
        List<Iterator<HiveSplit>> iteratorList = new ArrayList<>();
        for (int bucketIndex = 0; bucketIndex < bucketCount; bucketIndex++) {
            LocatedFileStatus file = list.get(bucketIndex);
            boolean splittable = isSplittable(iterator.getInputFormat(), hdfsEnvironment.getFileSystem(session.getUser(), file.getPath()), file.getPath());
            iteratorList.add(createHiveSplitIterator(iterator.getPartitionName(), file.getPath().toString(), file.getBlockLocations(), 0, file.getLen(), iterator.getSchema(), iterator.getPartitionKeys(), splittable, session, OptionalInt.of(bucketIndex), iterator.getEffectivePredicate(), partition.getColumnCoercions()));
        }
        addToHiveSplitSourceRoundRobin(iteratorList);
        return;
    }
    fileIterators.addLast(iterator);
}
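
The getTargetPathsFromSymlink(fs, path) helper called in the symlink branch above is private to BackgroundHiveSplitLoader and is not shown in this snippet. The sketch below is not the Presto implementation; it is a minimal, self-contained stand-in that only follows the SymlinkTextInputFormat manifest convention, in which every non-hidden file under the partition directory lists one target data-file path per line. The class name SymlinkTargets is made up for the example.

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

// Hypothetical stand-in for the private helper; not the Presto source.
final class SymlinkTargets {
    private SymlinkTargets() {}

    // Reads every non-hidden manifest ("symlink") file under the partition
    // directory and collects one target Path per non-empty line.
    static List<Path> getTargetPathsFromSymlink(FileSystem fileSystem, Path symlinkDir) throws IOException {
        List<Path> targets = new ArrayList<>();
        for (FileStatus manifest : fileSystem.listStatus(symlinkDir)) {
            String name = manifest.getPath().getName();
            if (manifest.isDirectory() || name.startsWith("_") || name.startsWith(".")) {
                continue; // skip subdirectories and hidden/metadata files
            }
            try (BufferedReader reader = new BufferedReader(
                    new InputStreamReader(fileSystem.open(manifest.getPath()), StandardCharsets.UTF_8))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    if (!line.trim().isEmpty()) {
                        targets.add(new Path(line.trim()));
                    }
                }
            }
        }
        return targets;
    }
}
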
Also used : HiveBucket (com.facebook.presto.hive.HiveBucketing.HiveBucket), Configuration (org.apache.hadoop.conf.Configuration), ArrayList (java.util.ArrayList), PrestoException (com.facebook.presto.spi.PrestoException), Properties (java.util.Properties), SymlinkTextInputFormat (org.apache.hadoop.hive.ql.io.SymlinkTextInputFormat), FileSystem (org.apache.hadoop.fs.FileSystem), PeekingIterator (com.google.common.collect.PeekingIterator), Iterator (java.util.Iterator), AbstractIterator (com.google.common.collect.AbstractIterator), HiveFileIterator (com.facebook.presto.hive.util.HiveFileIterator), JobConf (org.apache.hadoop.mapred.JobConf), InputSplit (org.apache.hadoop.mapred.InputSplit), Path (org.apache.hadoop.fs.Path), LocatedFileStatus (org.apache.hadoop.fs.LocatedFileStatus), TextInputFormat (org.apache.hadoop.mapred.TextInputFormat)
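
For readers who have not used this input format outside of Presto, here is a standalone sketch of reading a symlink-table partition directory directly with SymlinkTextInputFormat; the HDFS path and class name are hypothetical. getSplits() resolves the manifest files and builds splits over the target text files they name, so the record reader iterates the actual data rather than the manifests.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.SymlinkTextInputFormat;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;

public class SymlinkScanExample {
    public static void main(String[] args) throws Exception {
        JobConf job = new JobConf();
        // Hypothetical partition location: a directory of manifest ("symlink") files,
        // each line of which names an actual text data file.
        FileInputFormat.setInputPaths(job, new Path("hdfs:///warehouse/events/ds=2017-01-01"));

        SymlinkTextInputFormat inputFormat = new SymlinkTextInputFormat();
        // Splits are computed over the target files named in the manifests.
        InputSplit[] splits = inputFormat.getSplits(job, 0);
        for (InputSplit split : splits) {
            RecordReader<LongWritable, Text> reader = inputFormat.getRecordReader(split, job, Reporter.NULL);
            LongWritable key = reader.createKey();
            Text value = reader.createValue();
            while (reader.next(key, value)) {
                System.out.println(value);
            }
            reader.close();
        }
    }
}
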
