Search in sources :

Example 1 with TempFileReader

use of io.prestosql.plugin.hive.util.TempFileReader in project hetu-core by openlookeng.

In the class HiveWriterFactory, the method mergeSubFiles:

/**
 * Merges snapshot sub-files (left behind by earlier runs of a resumable query)
 * into each writer's final result file. No-op when {@code writers} is empty.
 *
 * @param writers writers whose snapshot sub-files should be appended to their result files
 * @throws IOException if a sub-file cannot be read or a page cannot be appended
 */
public void mergeSubFiles(List<HiveWriter> writers) throws IOException {
    if (writers.isEmpty()) {
        return;
    }
    // All writers target the same file system; resolve it from the first writer's path.
    FileSystem fileSystem = hdfsEnvironment.getFileSystem(session.getUser(), new Path(writers.get(0).getFilePath()), conf);
    // Column types used to decode the ORC-encoded sub-files back into pages.
    List<Type> types = dataColumns.stream().map(column -> column.getHiveType().getType(typeManager)).collect(toList());
    for (HiveWriter writer : writers) {
        String filePath = writer.getFilePath();
        Path path = new Path(filePath);
        logContainingFolderInfo(fileSystem, path, "Merging snapshot files to result file: %s", path);
        // The snapshotSuffixes list records the "resumeCount" for each suffix.
        // It doesn't have an entry for the current set of files, so an entry is added first.
        // The resumeCount helps distinguish files created by different runs.
        // NOTE(review): snapshotSuffixes is shared state and is appended to once per
        // writer in this loop, so the inner loop bound grows with each writer — confirm
        // this accumulation is intended rather than a one-time addition before the loop.
        snapshotSuffixes.add(resumeCount);
        for (int i = 0; i < snapshotSuffixes.size(); i++) {
            long resume = snapshotSuffixes.get(i);
            // Sub-file name is derived from the result file path, the run's resumeCount,
            // and the suffix index; a given sub-file may legitimately not exist.
            Path file = new Path(toSnapshotSubFile(filePath, resume, i));
            if (fileSystem.exists(file)) {
                // TODO-cp-I2BZ0A: assuming all files to be of ORC type.
                // Using same parameters as used by SortingFileWriter
                FileStatus fileStatus = fileSystem.getFileStatus(file);
                // try-with-resources: closing the TempFileReader also closes the data source.
                try (TempFileReader reader = new TempFileReader(types, new HdfsOrcDataSource(new OrcDataSourceId(file.toString()), fileStatus.getLen(), new DataSize(1, MEGABYTE), new DataSize(8, MEGABYTE), new DataSize(8, MEGABYTE), false, fileSystem.open(file), new FileFormatDataSourceStats(), fileStatus.getModificationTime()))) {
                    while (reader.hasNext()) {
                        writer.append(reader.next());
                    }
                }
            // DO NOT delete the sub file, in case we need to resume. Delete them when the query finishes.
            }
        }
    }
}
Also used : Path(org.apache.hadoop.fs.Path) DateTimeZone(org.joda.time.DateTimeZone) Arrays(java.util.Arrays) StorageFormat(io.prestosql.plugin.hive.metastore.StorageFormat) UpdateMode(io.prestosql.plugin.hive.PartitionUpdate.UpdateMode) FileSystem(org.apache.hadoop.fs.FileSystem) HIVE_FILESYSTEM_ERROR(io.prestosql.plugin.hive.HiveErrorCode.HIVE_FILESYSTEM_ERROR) HdfsContext(io.prestosql.plugin.hive.HdfsEnvironment.HdfsContext) FileStatus(org.apache.hadoop.fs.FileStatus) SortOrder(io.prestosql.spi.block.SortOrder) CompressionCodec(org.apache.hadoop.io.compress.CompressionCodec) Preconditions.checkArgument(com.google.common.base.Preconditions.checkArgument) ConnectorSession(io.prestosql.spi.connector.ConnectorSession) HiveIgnoreKeyTextOutputFormat(org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat) Collectors.toMap(java.util.stream.Collectors.toMap) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) HiveUtil.getColumnNames(io.prestosql.plugin.hive.HiveUtil.getColumnNames) PropertyMetadata(io.prestosql.spi.session.PropertyMetadata) Path(org.apache.hadoop.fs.Path) HIVE_PARTITION_SCHEMA_MISMATCH(io.prestosql.plugin.hive.HiveErrorCode.HIVE_PARTITION_SCHEMA_MISMATCH) Type(io.prestosql.spi.type.Type) ENGLISH(java.util.Locale.ENGLISH) PrestoException(io.prestosql.spi.PrestoException) ImmutableSet(com.google.common.collect.ImmutableSet) HIVE_WRITER_OPEN_ERROR(io.prestosql.plugin.hive.HiveErrorCode.HIVE_WRITER_OPEN_ERROR) ImmutableMap(com.google.common.collect.ImmutableMap) EventClient(io.airlift.event.client.EventClient) HIVE_UNSUPPORTED_FORMAT(io.prestosql.plugin.hive.HiveErrorCode.HIVE_UNSUPPORTED_FORMAT) DefaultCodec(org.apache.hadoop.io.compress.DefaultCodec) Set(java.util.Set) HIVE_PATH_ALREADY_EXISTS(io.prestosql.plugin.hive.HiveErrorCode.HIVE_PATH_ALREADY_EXISTS) Math.min(java.lang.Math.min) Collectors(java.util.stream.Collectors) HiveWriteUtils.createPartitionValues(io.prestosql.plugin.hive.HiveWriteUtils.createPartitionValues) 
Sets(com.google.common.collect.Sets) String.format(java.lang.String.format) Collectors.joining(java.util.stream.Collectors.joining) Preconditions.checkState(com.google.common.base.Preconditions.checkState) ReflectionUtil(org.apache.hive.common.util.ReflectionUtil) DataSize(io.airlift.units.DataSize) List(java.util.List) ImmutableMap.toImmutableMap(com.google.common.collect.ImmutableMap.toImmutableMap) Principal(java.security.Principal) Table(io.prestosql.plugin.hive.metastore.Table) HIVE_TABLE_READ_ONLY(io.prestosql.plugin.hive.HiveErrorCode.HIVE_TABLE_READ_ONLY) HdfsOrcDataSource(io.prestosql.plugin.hive.orc.HdfsOrcDataSource) Function.identity(java.util.function.Function.identity) FileUtils(org.apache.hadoop.hive.common.FileUtils) Optional(java.util.Optional) SortingColumn(io.prestosql.plugin.hive.metastore.SortingColumn) ConfigurationUtils.toJobConf(io.prestosql.plugin.hive.util.ConfigurationUtils.toJobConf) AcidUtils(org.apache.hadoop.hive.ql.io.AcidUtils) HivePageSinkMetadataProvider(io.prestosql.plugin.hive.metastore.HivePageSinkMetadataProvider) Partition(io.prestosql.plugin.hive.metastore.Partition) Logger(io.airlift.log.Logger) HiveUtil.getColumnTypes(io.prestosql.plugin.hive.HiveUtil.getColumnTypes) MEGABYTE(io.airlift.units.DataSize.Unit.MEGABYTE) HashMap(java.util.HashMap) StorageFormat.fromHiveStorageFormat(io.prestosql.plugin.hive.metastore.StorageFormat.fromHiveStorageFormat) IOConstants(org.apache.hadoop.hive.ql.io.IOConstants) TempFileReader(io.prestosql.plugin.hive.util.TempFileReader) NOT_FOUND(io.prestosql.spi.StandardErrorCode.NOT_FOUND) OptionalInt(java.util.OptionalInt) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) Strings(com.google.common.base.Strings) ImmutableList(com.google.common.collect.ImmutableList) COMPRESSRESULT(org.apache.hadoop.hive.conf.HiveConf.ConfVars.COMPRESSRESULT) Objects.requireNonNull(java.util.Objects.requireNonNull) HIVE_PARTITION_READ_ONLY(io.prestosql.plugin.hive.HiveErrorCode.HIVE_PARTITION_READ_ONLY) 
DIRECT_TO_TARGET_EXISTING_DIRECTORY(io.prestosql.plugin.hive.LocationHandle.WriteMode.DIRECT_TO_TARGET_EXISTING_DIRECTORY) Properties(java.util.Properties) InsertExistingPartitionsBehavior(io.prestosql.plugin.hive.HiveSessionProperties.InsertExistingPartitionsBehavior) HiveConf(org.apache.hadoop.hive.conf.HiveConf) TypeManager(io.prestosql.spi.type.TypeManager) AcidOutputFormat(org.apache.hadoop.hive.ql.io.AcidOutputFormat) HIVE_INVALID_METADATA(io.prestosql.plugin.hive.HiveErrorCode.HIVE_INVALID_METADATA) Page(io.prestosql.spi.Page) IOException(java.io.IOException) PageSorter(io.prestosql.spi.PageSorter) Options(org.apache.hadoop.hive.ql.io.AcidOutputFormat.Options) JobConf(org.apache.hadoop.mapred.JobConf) Consumer(java.util.function.Consumer) UUID.randomUUID(java.util.UUID.randomUUID) Collectors.toList(java.util.stream.Collectors.toList) MetastoreUtil.getHiveSchema(io.prestosql.plugin.hive.metastore.MetastoreUtil.getHiveSchema) Column(io.prestosql.plugin.hive.metastore.Column) NodeManager(io.prestosql.spi.NodeManager) WriteInfo(io.prestosql.plugin.hive.LocationService.WriteInfo) OrcDataSourceId(io.prestosql.orc.OrcDataSourceId) FileStatus(org.apache.hadoop.fs.FileStatus) OrcDataSourceId(io.prestosql.orc.OrcDataSourceId) HdfsOrcDataSource(io.prestosql.plugin.hive.orc.HdfsOrcDataSource) Type(io.prestosql.spi.type.Type) FileSystem(org.apache.hadoop.fs.FileSystem) DataSize(io.airlift.units.DataSize) TempFileReader(io.prestosql.plugin.hive.util.TempFileReader)

Example 2 with TempFileReader

use of io.prestosql.plugin.hive.util.TempFileReader in project hetu-core by openlookeng.

In the class SortingFileWriter, the method mergeFiles:

/**
 * Merge-sorts the pages of the given temporary files into a single ordered
 * stream, hands every page to {@code consumer}, then deletes the temp files.
 *
 * @param files sorted temporary spill files to merge
 * @param consumer receives each merged page in sort order
 * @throws UncheckedIOException if any file cannot be read or deleted
 */
private void mergeFiles(Iterable<TempFile> files, Consumer<Page> consumer) {
    try (Closer resourceCloser = Closer.create()) {
        Collection<Iterator<Page>> pageIterators = new ArrayList<>();
        for (TempFile tempFile : files) {
            Path tempPath = tempFile.getPath();
            FileStatus status = fileSystem.getFileStatus(tempPath);
            // Same read parameters as used when the temp files were written.
            OrcDataSource source = new HdfsOrcDataSource(
                    new OrcDataSourceId(tempPath.toString()),
                    status.getLen(),
                    new DataSize(1, MEGABYTE),
                    new DataSize(8, MEGABYTE),
                    new DataSize(8, MEGABYTE),
                    false,
                    fileSystem.open(tempPath),
                    new FileFormatDataSourceStats(),
                    status.getModificationTime());
            TempFileReader pageReader = new TempFileReader(types, source);
            // Closing the reader also closes the data source
            resourceCloser.register(pageReader);
            pageIterators.add(pageReader);
        }
        new MergingPageIterator(pageIterators, types, sortFields, sortOrders).forEachRemaining(consumer);
        // The merged output is durable now; remove the spill files and verify removal.
        for (TempFile tempFile : files) {
            Path tempPath = tempFile.getPath();
            fileSystem.delete(tempPath, false);
            if (fileSystem.exists(tempPath)) {
                throw new IOException("Failed to delete temporary file: " + tempPath);
            }
        }
    }
    catch (IOException e) {
        throw new UncheckedIOException(e);
    }
}
Also used : Closer(com.google.common.io.Closer) Path(org.apache.hadoop.fs.Path) OrcDataSource(io.prestosql.orc.OrcDataSource) HdfsOrcDataSource(io.prestosql.plugin.hive.orc.HdfsOrcDataSource) MergingPageIterator(io.prestosql.plugin.hive.util.MergingPageIterator) FileStatus(org.apache.hadoop.fs.FileStatus) OrcDataSourceId(io.prestosql.orc.OrcDataSourceId) ArrayList(java.util.ArrayList) HdfsOrcDataSource(io.prestosql.plugin.hive.orc.HdfsOrcDataSource) UncheckedIOException(java.io.UncheckedIOException) IOException(java.io.IOException) UncheckedIOException(java.io.UncheckedIOException) DataSize(io.airlift.units.DataSize) Iterator(java.util.Iterator) MergingPageIterator(io.prestosql.plugin.hive.util.MergingPageIterator) TempFileReader(io.prestosql.plugin.hive.util.TempFileReader)

Aggregations

DataSize (io.airlift.units.DataSize)2 OrcDataSourceId (io.prestosql.orc.OrcDataSourceId)2 HdfsOrcDataSource (io.prestosql.plugin.hive.orc.HdfsOrcDataSource)2 TempFileReader (io.prestosql.plugin.hive.util.TempFileReader)2 IOException (java.io.IOException)2 ArrayList (java.util.ArrayList)2 FileStatus (org.apache.hadoop.fs.FileStatus)2 Path (org.apache.hadoop.fs.Path)2 Preconditions.checkArgument (com.google.common.base.Preconditions.checkArgument)1 Preconditions.checkState (com.google.common.base.Preconditions.checkState)1 Strings (com.google.common.base.Strings)1 ImmutableList (com.google.common.collect.ImmutableList)1 ImmutableMap (com.google.common.collect.ImmutableMap)1 ImmutableMap.toImmutableMap (com.google.common.collect.ImmutableMap.toImmutableMap)1 ImmutableSet (com.google.common.collect.ImmutableSet)1 Sets (com.google.common.collect.Sets)1 Closer (com.google.common.io.Closer)1 EventClient (io.airlift.event.client.EventClient)1 Logger (io.airlift.log.Logger)1 MEGABYTE (io.airlift.units.DataSize.Unit.MEGABYTE)1