Example 1 with AcidInfo

Use of io.trino.plugin.hive.AcidInfo in the trino project by trinodb.

From class OrcPageSourceFactory, method createOrcPageSource:

private ConnectorPageSource createOrcPageSource(
        HdfsEnvironment hdfsEnvironment,
        ConnectorIdentity identity,
        Configuration configuration,
        Path path,
        long start,
        long length,
        long estimatedFileSize,
        List<HiveColumnHandle> columns,
        List<HiveColumnHandle> projections,
        boolean useOrcColumnNames,
        boolean isFullAcid,
        TupleDomain<HiveColumnHandle> effectivePredicate,
        DateTimeZone legacyFileTimeZone,
        OrcReaderOptions options,
        Optional<AcidInfo> acidInfo,
        OptionalInt bucketNumber,
        boolean originalFile,
        AcidTransaction transaction,
        FileFormatDataSourceStats stats) {
    for (HiveColumnHandle column : columns) {
        checkArgument(column.getColumnType() == REGULAR, "column type must be regular: %s", column);
    }
    checkArgument(!effectivePredicate.isNone());
    OrcDataSource orcDataSource;
    boolean originalFilesPresent = acidInfo.isPresent() && !acidInfo.get().getOriginalFiles().isEmpty();
    try {
        FileSystem fileSystem = hdfsEnvironment.getFileSystem(identity, path, configuration);
        FSDataInputStream inputStream = hdfsEnvironment.doAs(identity, () -> fileSystem.open(path));
        orcDataSource = new HdfsOrcDataSource(new OrcDataSourceId(path.toString()), estimatedFileSize, options, inputStream, stats);
    } catch (Exception e) {
        if (nullToEmpty(e.getMessage()).trim().equals("Filesystem closed") || e instanceof FileNotFoundException) {
            throw new TrinoException(HIVE_CANNOT_OPEN_SPLIT, e);
        }
        throw new TrinoException(HIVE_CANNOT_OPEN_SPLIT, splitError(e, path, start, length), e);
    }
    AggregatedMemoryContext memoryUsage = newSimpleAggregatedMemoryContext();
    try {
        Optional<OrcReader> optionalOrcReader = OrcReader.createOrcReader(orcDataSource, options);
        if (optionalOrcReader.isEmpty()) {
            return new EmptyPageSource();
        }
        OrcReader reader = optionalOrcReader.get();
        if (!originalFile && acidInfo.isPresent() && !acidInfo.get().isOrcAcidVersionValidated()) {
            validateOrcAcidVersion(path, reader);
        }
        List<OrcColumn> fileColumns = reader.getRootColumn().getNestedColumns();
        int actualColumnCount = columns.size() + (isFullAcid ? 3 : 0);
        List<OrcColumn> fileReadColumns = new ArrayList<>(actualColumnCount);
        List<Type> fileReadTypes = new ArrayList<>(actualColumnCount);
        List<OrcReader.ProjectedLayout> fileReadLayouts = new ArrayList<>(actualColumnCount);
        if (isFullAcid && !originalFilesPresent) {
            verifyAcidSchema(reader, path);
            Map<String, OrcColumn> acidColumnsByName = uniqueIndex(fileColumns, orcColumn -> orcColumn.getColumnName().toLowerCase(ENGLISH));
            fileColumns = ensureColumnNameConsistency(acidColumnsByName.get(AcidSchema.ACID_COLUMN_ROW_STRUCT.toLowerCase(ENGLISH)).getNestedColumns(), columns);
            fileReadColumns.add(acidColumnsByName.get(AcidSchema.ACID_COLUMN_ORIGINAL_TRANSACTION.toLowerCase(ENGLISH)));
            fileReadTypes.add(BIGINT);
            fileReadLayouts.add(fullyProjectedLayout());
            fileReadColumns.add(acidColumnsByName.get(AcidSchema.ACID_COLUMN_BUCKET.toLowerCase(ENGLISH)));
            fileReadTypes.add(INTEGER);
            fileReadLayouts.add(fullyProjectedLayout());
            fileReadColumns.add(acidColumnsByName.get(AcidSchema.ACID_COLUMN_ROW_ID.toLowerCase(ENGLISH)));
            fileReadTypes.add(BIGINT);
            fileReadLayouts.add(fullyProjectedLayout());
        }
        Map<String, OrcColumn> fileColumnsByName = ImmutableMap.of();
        if (useOrcColumnNames || isFullAcid) {
            verifyFileHasColumnNames(fileColumns, path);
            // Convert column names read from ORC files to lower case to be consistent with those stored in Hive Metastore
            fileColumnsByName = uniqueIndex(fileColumns, orcColumn -> orcColumn.getColumnName().toLowerCase(ENGLISH));
        }
        Map<String, List<List<String>>> projectionsByColumnName = ImmutableMap.of();
        Map<Integer, List<List<String>>> projectionsByColumnIndex = ImmutableMap.of();
        if (useOrcColumnNames || isFullAcid) {
            projectionsByColumnName = projections.stream().collect(Collectors.groupingBy(HiveColumnHandle::getBaseColumnName, mapping(OrcPageSourceFactory::getDereferencesAsList, toList())));
        } else {
            projectionsByColumnIndex = projections.stream().collect(Collectors.groupingBy(HiveColumnHandle::getBaseHiveColumnIndex, mapping(OrcPageSourceFactory::getDereferencesAsList, toList())));
        }
        TupleDomainOrcPredicateBuilder predicateBuilder = TupleDomainOrcPredicate.builder().setBloomFiltersEnabled(options.isBloomFiltersEnabled()).setDomainCompactionThreshold(domainCompactionThreshold);
        Map<HiveColumnHandle, Domain> effectivePredicateDomains = effectivePredicate.getDomains().orElseThrow(() -> new IllegalArgumentException("Effective predicate is none"));
        List<ColumnAdaptation> columnAdaptations = new ArrayList<>(columns.size());
        for (HiveColumnHandle column : columns) {
            OrcColumn orcColumn = null;
            OrcReader.ProjectedLayout projectedLayout = null;
            Map<Optional<HiveColumnProjectionInfo>, Domain> columnDomains = null;
            if (useOrcColumnNames || isFullAcid) {
                String columnName = column.getName().toLowerCase(ENGLISH);
                orcColumn = fileColumnsByName.get(columnName);
                if (orcColumn != null) {
                    projectedLayout = createProjectedLayout(orcColumn, projectionsByColumnName.get(columnName));
                    columnDomains = effectivePredicateDomains.entrySet().stream().filter(columnDomain -> columnDomain.getKey().getBaseColumnName().toLowerCase(ENGLISH).equals(columnName)).collect(toImmutableMap(columnDomain -> columnDomain.getKey().getHiveColumnProjectionInfo(), Map.Entry::getValue));
                }
            } else if (column.getBaseHiveColumnIndex() < fileColumns.size()) {
                orcColumn = fileColumns.get(column.getBaseHiveColumnIndex());
                if (orcColumn != null) {
                    projectedLayout = createProjectedLayout(orcColumn, projectionsByColumnIndex.get(column.getBaseHiveColumnIndex()));
                    columnDomains = effectivePredicateDomains.entrySet().stream().filter(columnDomain -> columnDomain.getKey().getBaseHiveColumnIndex() == column.getBaseHiveColumnIndex()).collect(toImmutableMap(columnDomain -> columnDomain.getKey().getHiveColumnProjectionInfo(), Map.Entry::getValue));
                }
            }
            Type readType = column.getType();
            if (orcColumn != null) {
                int sourceIndex = fileReadColumns.size();
                columnAdaptations.add(ColumnAdaptation.sourceColumn(sourceIndex));
                fileReadColumns.add(orcColumn);
                fileReadTypes.add(readType);
                fileReadLayouts.add(projectedLayout);
                // Add predicates on top-level and nested columns
                for (Map.Entry<Optional<HiveColumnProjectionInfo>, Domain> columnDomain : columnDomains.entrySet()) {
                    OrcColumn nestedColumn = getNestedColumn(orcColumn, columnDomain.getKey());
                    if (nestedColumn != null) {
                        predicateBuilder.addColumn(nestedColumn.getColumnId(), columnDomain.getValue());
                    }
                }
            } else {
                columnAdaptations.add(ColumnAdaptation.nullColumn(readType));
            }
        }
        OrcRecordReader recordReader = reader.createRecordReader(fileReadColumns, fileReadTypes, fileReadLayouts, predicateBuilder.build(), start, length, legacyFileTimeZone, memoryUsage, INITIAL_BATCH_SIZE, exception -> handleException(orcDataSource.getId(), exception), NameBasedFieldMapper::create);
        Optional<OrcDeletedRows> deletedRows = acidInfo.map(info -> new OrcDeletedRows(path.getName(), new OrcDeleteDeltaPageSourceFactory(options, identity, configuration, hdfsEnvironment, stats), identity, configuration, hdfsEnvironment, info, bucketNumber, memoryUsage));
        Optional<Long> originalFileRowId = acidInfo.filter(OrcPageSourceFactory::hasOriginalFiles).map(info -> OriginalFilesUtils.getPrecedingRowCount(acidInfo.get().getOriginalFiles(), path, hdfsEnvironment, identity, options, configuration, stats));
        if (transaction.isDelete()) {
            if (originalFile) {
                int bucket = bucketNumber.orElse(0);
                long startingRowId = originalFileRowId.orElse(0L);
                columnAdaptations.add(ColumnAdaptation.originalFileRowIdColumn(startingRowId, bucket));
            } else {
                columnAdaptations.add(ColumnAdaptation.rowIdColumn());
            }
        } else if (transaction.isUpdate()) {
            HiveUpdateProcessor updateProcessor = transaction.getUpdateProcessor().orElseThrow(() -> new IllegalArgumentException("updateProcessor not present"));
            List<HiveColumnHandle> dependencyColumns = projections.stream().filter(HiveColumnHandle::isBaseColumn).collect(toImmutableList());
            if (originalFile) {
                int bucket = bucketNumber.orElse(0);
                long startingRowId = originalFileRowId.orElse(0L);
                columnAdaptations.add(updatedRowColumnsWithOriginalFiles(startingRowId, bucket, updateProcessor, dependencyColumns));
            } else {
                columnAdaptations.add(updatedRowColumns(updateProcessor, dependencyColumns));
            }
        }
        return new OrcPageSource(recordReader, columnAdaptations, orcDataSource, deletedRows, originalFileRowId, memoryUsage, stats);
    } catch (Exception e) {
        try {
            orcDataSource.close();
        } catch (IOException ignored) {
        }
        if (e instanceof TrinoException) {
            throw (TrinoException) e;
        }
        String message = splitError(e, path, start, length);
        if (e instanceof BlockMissingException) {
            throw new TrinoException(HIVE_MISSING_DATA, message, e);
        }
        throw new TrinoException(HIVE_CANNOT_OPEN_SPLIT, message, e);
    }
}
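
A side note on the ACID layout the isFullAcid branch above relies on (my summary, not Trino code): the user columns of a full-ACID file live inside the nested row struct of the Hive ACID schema, and three synthetic metadata columns are read ahead of them so that OrcDeletedRows can match each row against delete delta files. A standalone sketch listing those columns, with descriptions that are mine:

import java.util.List;

public class AcidMetadataColumnsSketch {
    public static void main(String[] args) {
        // The three metadata columns added to fileReadColumns when isFullAcid is true
        // (names per the AcidSchema constants referenced above; descriptions are my own).
        List<String> acidMetadataColumns = List.of(
                "originalTransaction : bigint  -- transaction that first wrote the row",
                "bucket               : int    -- encoded bucket id of the writing task",
                "rowId                : bigint -- row number within (originalTransaction, bucket)");
        acidMetadataColumns.forEach(System.out::println);
    }
}
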
Also used : DateTimeZone(org.joda.time.DateTimeZone) HiveUpdateProcessor(io.trino.plugin.hive.HiveUpdateProcessor) FileSystem(org.apache.hadoop.fs.FileSystem) HIVE_CANNOT_OPEN_SPLIT(io.trino.plugin.hive.HiveErrorCode.HIVE_CANNOT_OPEN_SPLIT) OrcTypeKind(io.trino.orc.metadata.OrcType.OrcTypeKind) Maps.uniqueIndex(com.google.common.collect.Maps.uniqueIndex) ColumnAdaptation(io.trino.plugin.hive.orc.OrcPageSource.ColumnAdaptation) BlockMissingException(org.apache.hadoop.hdfs.BlockMissingException) NOT_SUPPORTED(io.trino.spi.StandardErrorCode.NOT_SUPPORTED) HiveSessionProperties.getOrcLazyReadSmallRanges(io.trino.plugin.hive.HiveSessionProperties.getOrcLazyReadSmallRanges) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) ConnectorPageSource(io.trino.spi.connector.ConnectorPageSource) ENGLISH(java.util.Locale.ENGLISH) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream) AcidTransaction(io.trino.plugin.hive.acid.AcidTransaction) HiveSessionProperties.getOrcTinyStripeThreshold(io.trino.plugin.hive.HiveSessionProperties.getOrcTinyStripeThreshold) FileFormatDataSourceStats(io.trino.plugin.hive.FileFormatDataSourceStats) HdfsEnvironment(io.trino.plugin.hive.HdfsEnvironment) ConnectorIdentity(io.trino.spi.security.ConnectorIdentity) Domain(io.trino.spi.predicate.Domain) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) ReaderColumns(io.trino.plugin.hive.ReaderColumns) ReaderPageSource(io.trino.plugin.hive.ReaderPageSource) ImmutableMap.toImmutableMap(com.google.common.collect.ImmutableMap.toImmutableMap) Slice(io.airlift.slice.Slice) STRUCT(io.trino.orc.metadata.OrcType.OrcTypeKind.STRUCT) ColumnAdaptation.updatedRowColumns(io.trino.plugin.hive.orc.OrcPageSource.ColumnAdaptation.updatedRowColumns) ArrayList(java.util.ArrayList) Collectors.toUnmodifiableList(java.util.stream.Collectors.toUnmodifiableList) TupleDomainOrcPredicateBuilder(io.trino.orc.TupleDomainOrcPredicate.TupleDomainOrcPredicateBuilder) LONG(io.trino.orc.metadata.OrcType.OrcTypeKind.LONG) AggregatedMemoryContext(io.trino.memory.context.AggregatedMemoryContext) INT(io.trino.orc.metadata.OrcType.OrcTypeKind.INT) HIVE_BAD_DATA(io.trino.plugin.hive.HiveErrorCode.HIVE_BAD_DATA) Properties(java.util.Properties) IOException(java.io.IOException) ConnectorSession(io.trino.spi.connector.ConnectorSession) NameBasedFieldMapper(io.trino.orc.NameBasedFieldMapper) HivePageSourceProvider.projectBaseColumns(io.trino.plugin.hive.HivePageSourceProvider.projectBaseColumns) HiveSessionProperties.isOrcNestedLazy(io.trino.plugin.hive.HiveSessionProperties.isOrcNestedLazy) OrcColumn(io.trino.orc.OrcColumn) HIVE_MISSING_DATA(io.trino.plugin.hive.HiveErrorCode.HIVE_MISSING_DATA) OrcSerde(org.apache.hadoop.hive.ql.io.orc.OrcSerde) Preconditions.checkArgument(com.google.common.base.Preconditions.checkArgument) OrcRecordReader(io.trino.orc.OrcRecordReader) Path(org.apache.hadoop.fs.Path) OrcDataSource(io.trino.orc.OrcDataSource) HiveColumnHandle(io.trino.plugin.hive.HiveColumnHandle) INTEGER(io.trino.spi.type.IntegerType.INTEGER) ImmutableMap(com.google.common.collect.ImmutableMap) AcidUtils.isFullAcidTable(org.apache.hadoop.hive.ql.io.AcidUtils.isFullAcidTable) INITIAL_BATCH_SIZE(io.trino.orc.OrcReader.INITIAL_BATCH_SIZE) OrcPageSource.handleException(io.trino.plugin.hive.orc.OrcPageSource.handleException) TrinoException(io.trino.spi.TrinoException) Collectors(java.util.stream.Collectors) FileNotFoundException(java.io.FileNotFoundException) String.format(java.lang.String.format) 
OrcDataSourceId(io.trino.orc.OrcDataSourceId) List(java.util.List) BIGINT(io.trino.spi.type.BigintType.BIGINT) Function.identity(java.util.function.Function.identity) Optional(java.util.Optional) PRESTO_WRITER_ID(io.trino.orc.metadata.OrcMetadataWriter.PRESTO_WRITER_ID) HivePageSourceFactory(io.trino.plugin.hive.HivePageSourceFactory) Pattern(java.util.regex.Pattern) TRINO_WRITER_ID(io.trino.orc.metadata.OrcMetadataWriter.TRINO_WRITER_ID) Strings.nullToEmpty(com.google.common.base.Strings.nullToEmpty) AggregatedMemoryContext.newSimpleAggregatedMemoryContext(io.trino.memory.context.AggregatedMemoryContext.newSimpleAggregatedMemoryContext) HiveUtil.isDeserializerClass(io.trino.plugin.hive.util.HiveUtil.isDeserializerClass) Type(io.trino.spi.type.Type) TupleDomainOrcPredicate(io.trino.orc.TupleDomainOrcPredicate) AcidSchema(io.trino.plugin.hive.acid.AcidSchema) HiveSessionProperties.isUseOrcColumnNames(io.trino.plugin.hive.HiveSessionProperties.isUseOrcColumnNames) OptionalInt(java.util.OptionalInt) Inject(javax.inject.Inject) HiveSessionProperties.getOrcStreamBufferSize(io.trino.plugin.hive.HiveSessionProperties.getOrcStreamBufferSize) ImmutableList(com.google.common.collect.ImmutableList) OrcReaderOptions(io.trino.orc.OrcReaderOptions) Objects.requireNonNull(java.util.Objects.requireNonNull) Collectors.mapping(java.util.stream.Collectors.mapping) HiveSessionProperties.isOrcBloomFiltersEnabled(io.trino.plugin.hive.HiveSessionProperties.isOrcBloomFiltersEnabled) HiveSessionProperties.getOrcMaxReadBlockSize(io.trino.plugin.hive.HiveSessionProperties.getOrcMaxReadBlockSize) OrcReader(io.trino.orc.OrcReader) HiveSessionProperties.getOrcMaxBufferSize(io.trino.plugin.hive.HiveSessionProperties.getOrcMaxBufferSize) NameBasedProjectedLayout.createProjectedLayout(io.trino.orc.OrcReader.NameBasedProjectedLayout.createProjectedLayout) UTF_8(java.nio.charset.StandardCharsets.UTF_8) TupleDomain(io.trino.spi.predicate.TupleDomain) OrcReader.fullyProjectedLayout(io.trino.orc.OrcReader.fullyProjectedLayout) Maps(com.google.common.collect.Maps) HiveSessionProperties.getOrcMaxMergeDistance(io.trino.plugin.hive.HiveSessionProperties.getOrcMaxMergeDistance) ColumnAdaptation.updatedRowColumnsWithOriginalFiles(io.trino.plugin.hive.orc.OrcPageSource.ColumnAdaptation.updatedRowColumnsWithOriginalFiles) AcidInfo(io.trino.plugin.hive.AcidInfo) HiveColumnProjectionInfo(io.trino.plugin.hive.HiveColumnProjectionInfo) Collectors.toList(java.util.stream.Collectors.toList) EmptyPageSource(io.trino.spi.connector.EmptyPageSource) HIVE_FILE_MISSING_COLUMN_NAMES(io.trino.plugin.hive.HiveErrorCode.HIVE_FILE_MISSING_COLUMN_NAMES) HiveConfig(io.trino.plugin.hive.HiveConfig) REGULAR(io.trino.plugin.hive.HiveColumnHandle.ColumnType.REGULAR)

Example 2 with AcidInfo

Use of io.trino.plugin.hive.AcidInfo in the trino project by trinodb.

From class InternalHiveSplitFactory, method createInternalHiveSplit:

private Optional<InternalHiveSplit> createInternalHiveSplit(
        Path path,
        BlockLocation[] blockLocations,
        long start,
        long length,
        // Estimated because, for example, encrypted S3 files may be padded, so reported size may not reflect actual size
        long estimatedFileSize,
        long fileModificationTime,
        OptionalInt bucketNumber,
        boolean splittable,
        Optional<AcidInfo> acidInfo) {
    String pathString = path.toString();
    if (!pathMatchesPredicate(pathDomain, pathString)) {
        return Optional.empty();
    }
    // per HIVE-13040 empty files are allowed
    if (estimatedFileSize == 0) {
        return Optional.empty();
    }
    // The dynamic filter may not have been ready when the partition was loaded,
    // but it might be ready when splits are enumerated lazily.
    if (!partitionMatchSupplier.getAsBoolean()) {
        return Optional.empty();
    }
    if (maxSplitFileSize.isPresent() && estimatedFileSize > maxSplitFileSize.get()) {
        return Optional.empty();
    }
    ImmutableList.Builder<InternalHiveBlock> blockBuilder = ImmutableList.builder();
    for (BlockLocation blockLocation : blockLocations) {
        // clamp the block range
        long blockStart = Math.max(start, blockLocation.getOffset());
        long blockEnd = Math.min(start + length, blockLocation.getOffset() + blockLocation.getLength());
        if (blockStart > blockEnd) {
            // block is outside split range
            continue;
        }
        if (blockStart == blockEnd && !(blockStart == start && blockEnd == start + length)) {
            // skip zero-width block, except in the special circumstance: slice is empty, and the block covers the empty slice interval.
            continue;
        }
        blockBuilder.add(new InternalHiveBlock(blockStart, blockEnd, getHostAddresses(blockLocation)));
    }
    List<InternalHiveBlock> blocks = blockBuilder.build();
    checkBlocks(path, blocks, start, length);
    if (!splittable) {
        // not splittable, use the hosts from the first block if it exists
        blocks = ImmutableList.of(new InternalHiveBlock(start, start + length, blocks.get(0).getAddresses()));
    }
    int bucketNumberIndex = bucketNumber.orElse(0);
    return Optional.of(new InternalHiveSplit(
            partitionName,
            pathString,
            start,
            start + length,
            estimatedFileSize,
            fileModificationTime,
            schema,
            partitionKeys,
            blocks,
            bucketNumber,
            () -> bucketStatementCounters.computeIfAbsent(bucketNumberIndex, index -> new AtomicInteger()).getAndIncrement(),
            splittable,
            forceLocalScheduling && allBlocksHaveAddress(blocks),
            tableToPartitionMapping,
            bucketConversion,
            bucketValidation,
            s3SelectPushdownEnabled && S3SelectPushdown.isCompressionCodecSupported(inputFormat, path),
            acidInfo,
            partitionMatchSupplier));
}
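
The clamping in the block loop above is just interval intersection between the split range [start, start + length) and each HDFS block. A minimal, self-contained illustration with made-up numbers (the values are placeholders, not taken from Trino's tests):

public class BlockClampSketch {
    public static void main(String[] args) {
        long start = 0;
        long length = 256;        // split covers bytes [0, 256)
        long blockOffset = 128;
        long blockLength = 256;   // HDFS block covers bytes [128, 384)

        long blockStart = Math.max(start, blockOffset);                      // 128
        long blockEnd = Math.min(start + length, blockOffset + blockLength); // 256

        // blockStart > blockEnd would mean the block lies entirely outside the split;
        // a zero-width result is skipped unless the split itself is empty.
        System.out.printf("clamped block: [%d, %d)%n", blockStart, blockEnd);
    }
}
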
Also used : Arrays(java.util.Arrays) InternalHiveBlock(io.trino.plugin.hive.InternalHiveSplit.InternalHiveBlock) BlockLocation(org.apache.hadoop.fs.BlockLocation) FileSystem(org.apache.hadoop.fs.FileSystem) FileStatus(org.apache.hadoop.fs.FileStatus) OptionalInt(java.util.OptionalInt) BooleanSupplier(java.util.function.BooleanSupplier) Preconditions.checkArgument(com.google.common.base.Preconditions.checkArgument) FileSplit(org.apache.hadoop.mapred.FileSplit) ImmutableList(com.google.common.collect.ImmutableList) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) Map(java.util.Map) Objects.requireNonNull(java.util.Objects.requireNonNull) InputFormat(org.apache.hadoop.mapred.InputFormat) Path(org.apache.hadoop.fs.Path) Slices.utf8Slice(io.airlift.slice.Slices.utf8Slice) HiveColumnHandle.isPathColumnHandle(io.trino.plugin.hive.HiveColumnHandle.isPathColumnHandle) BucketConversion(io.trino.plugin.hive.HiveSplit.BucketConversion) HiveColumnHandle(io.trino.plugin.hive.HiveColumnHandle) InternalHiveSplit(io.trino.plugin.hive.InternalHiveSplit) AcidTransaction(io.trino.plugin.hive.acid.AcidTransaction) Properties(java.util.Properties) HivePartitionKey(io.trino.plugin.hive.HivePartitionKey) Domain(io.trino.spi.predicate.Domain) Collection(java.util.Collection) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) HiveSplit(io.trino.plugin.hive.HiveSplit) LocatedFileStatus(org.apache.hadoop.fs.LocatedFileStatus) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) HiveUtil.isSplittable(io.trino.plugin.hive.util.HiveUtil.isSplittable) IOException(java.io.IOException) TupleDomain(io.trino.spi.predicate.TupleDomain) AcidInfo(io.trino.plugin.hive.AcidInfo) UncheckedIOException(java.io.UncheckedIOException) DataSize(io.airlift.units.DataSize) List(java.util.List) TableToPartitionMapping(io.trino.plugin.hive.TableToPartitionMapping) S3SelectPushdown(io.trino.plugin.hive.s3select.S3SelectPushdown) Optional(java.util.Optional) HostAddress(io.trino.spi.HostAddress) InternalHiveBlock(io.trino.plugin.hive.InternalHiveSplit.InternalHiveBlock) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) ImmutableList(com.google.common.collect.ImmutableList) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) InternalHiveSplit(io.trino.plugin.hive.InternalHiveSplit) BlockLocation(org.apache.hadoop.fs.BlockLocation)

Example 3 with AcidInfo

Use of io.trino.plugin.hive.AcidInfo in the trino project by trinodb.

From class TestOrcPageSourceFactory, method readFile:

private static List<Nation> readFile(Map<NationColumn, Integer> columns, OptionalLong nationKeyPredicate, Optional<AcidInfo> acidInfo, String filePath, long fileSize) {
    TupleDomain<HiveColumnHandle> tupleDomain = TupleDomain.all();
    if (nationKeyPredicate.isPresent()) {
        tupleDomain = TupleDomain.withColumnDomains(ImmutableMap.of(toHiveColumnHandle(NATION_KEY, 0), Domain.singleValue(INTEGER, nationKeyPredicate.getAsLong())));
    }
    List<HiveColumnHandle> columnHandles = columns.entrySet().stream().map(entry -> toHiveColumnHandle(entry.getKey(), entry.getValue())).collect(toImmutableList());
    List<String> columnNames = columnHandles.stream().map(HiveColumnHandle::getName).collect(toImmutableList());
    Optional<ReaderPageSource> pageSourceWithProjections = PAGE_SOURCE_FACTORY.createPageSource(new JobConf(new Configuration(false)), SESSION, new Path(filePath), 0, fileSize, fileSize, createSchema(), columnHandles, tupleDomain, acidInfo, OptionalInt.empty(), false, NO_ACID_TRANSACTION);
    checkArgument(pageSourceWithProjections.isPresent());
    checkArgument(pageSourceWithProjections.get().getReaderColumns().isEmpty(), "projected columns not expected here");
    ConnectorPageSource pageSource = pageSourceWithProjections.get().get();
    int nationKeyColumn = columnNames.indexOf("n_nationkey");
    int nameColumn = columnNames.indexOf("n_name");
    int regionKeyColumn = columnNames.indexOf("n_regionkey");
    int commentColumn = columnNames.indexOf("n_comment");
    ImmutableList.Builder<Nation> rows = ImmutableList.builder();
    while (!pageSource.isFinished()) {
        Page page = pageSource.getNextPage();
        if (page == null) {
            continue;
        }
        page = page.getLoadedPage();
        for (int position = 0; position < page.getPositionCount(); position++) {
            long nationKey = -42;
            if (nationKeyColumn >= 0) {
                nationKey = BIGINT.getLong(page.getBlock(nationKeyColumn), position);
            }
            String name = "<not read>";
            if (nameColumn >= 0) {
                name = VARCHAR.getSlice(page.getBlock(nameColumn), position).toStringUtf8();
            }
            long regionKey = -42;
            if (regionKeyColumn >= 0) {
                regionKey = BIGINT.getLong(page.getBlock(regionKeyColumn), position);
            }
            String comment = "<not read>";
            if (commentColumn >= 0) {
                comment = VARCHAR.getSlice(page.getBlock(commentColumn), position).toStringUtf8();
            }
            rows.add(new Nation(position, nationKey, name, regionKey, comment));
        }
    }
    return rows.build();
}
Also used : URISyntaxException(java.net.URISyntaxException) Test(org.testng.annotations.Test) NO_ACID_TRANSACTION(io.trino.plugin.hive.acid.AcidTransaction.NO_ACID_TRANSACTION) Preconditions.checkArgument(com.google.common.base.Preconditions.checkArgument) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) Assertions(org.assertj.core.api.Assertions) ConnectorPageSource(io.trino.spi.connector.ConnectorPageSource) LongPredicate(java.util.function.LongPredicate) HiveColumnHandle(io.trino.plugin.hive.HiveColumnHandle) INTEGER(io.trino.spi.type.IntegerType.INTEGER) Assert.assertFalse(org.testng.Assert.assertFalse) SERIALIZATION_LIB(org.apache.hadoop.hive.serde.serdeConstants.SERIALIZATION_LIB) FileFormatDataSourceStats(io.trino.plugin.hive.FileFormatDataSourceStats) ImmutableMap(com.google.common.collect.ImmutableMap) Collections.nCopies(java.util.Collections.nCopies) Domain(io.trino.spi.predicate.Domain) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) Set(java.util.Set) HDFS_ENVIRONMENT(io.trino.plugin.hive.HiveTestUtils.HDFS_ENVIRONMENT) AcidUtils.deleteDeltaSubdir(org.apache.hadoop.hive.ql.io.AcidUtils.deleteDeltaSubdir) REGION_KEY(io.trino.tpch.NationColumn.REGION_KEY) Nation(io.trino.tpch.Nation) NationGenerator(io.trino.tpch.NationGenerator) ReaderPageSource(io.trino.plugin.hive.ReaderPageSource) Resources.getResource(com.google.common.io.Resources.getResource) NATION_KEY(io.trino.tpch.NationColumn.NATION_KEY) List(java.util.List) BIGINT(io.trino.spi.type.BigintType.BIGINT) Optional(java.util.Optional) HivePageSourceFactory(io.trino.plugin.hive.HivePageSourceFactory) NAME(io.trino.tpch.NationColumn.NAME) Type(io.trino.spi.type.Type) Page(io.trino.spi.Page) Assert.assertEquals(org.testng.Assert.assertEquals) OptionalInt(java.util.OptionalInt) ArrayList(java.util.ArrayList) OptionalLong(java.util.OptionalLong) VARCHAR(io.trino.spi.type.VarcharType.VARCHAR) ImmutableList(com.google.common.collect.ImmutableList) HiveColumnHandle.createBaseColumn(io.trino.plugin.hive.HiveColumnHandle.createBaseColumn) COMMENT(io.trino.tpch.NationColumn.COMMENT) NationColumn(io.trino.tpch.NationColumn) HiveType.toHiveType(io.trino.plugin.hive.HiveType.toHiveType) Properties(java.util.Properties) ORC(io.trino.plugin.hive.HiveStorageFormat.ORC) TABLE_IS_TRANSACTIONAL(org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.TABLE_IS_TRANSACTIONAL) TupleDomain(io.trino.spi.predicate.TupleDomain) AcidInfo(io.trino.plugin.hive.AcidInfo) File(java.io.File) JobConf(org.apache.hadoop.mapred.JobConf) SESSION(io.trino.plugin.hive.HiveTestUtils.SESSION) FILE_INPUT_FORMAT(org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.FILE_INPUT_FORMAT) HiveConfig(io.trino.plugin.hive.HiveConfig) REGULAR(io.trino.plugin.hive.HiveColumnHandle.ColumnType.REGULAR) Path(org.apache.hadoop.fs.Path) Nation(io.trino.tpch.Nation) Configuration(org.apache.hadoop.conf.Configuration) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) ImmutableList(com.google.common.collect.ImmutableList) Page(io.trino.spi.Page) ConnectorPageSource(io.trino.spi.connector.ConnectorPageSource) ReaderPageSource(io.trino.plugin.hive.ReaderPageSource) JobConf(org.apache.hadoop.mapred.JobConf) HiveColumnHandle(io.trino.plugin.hive.HiveColumnHandle)

Example 4 with AcidInfo

Use of io.trino.plugin.hive.AcidInfo in the trino project by trinodb.

From class TestOrcPageSourceFactory, method testFullFileReadOriginalFilesTable:

@Test
public void testFullFileReadOriginalFilesTable() throws Exception {
    File tableFile = new File(getResource("fullacidNationTableWithOriginalFiles/000000_0").toURI());
    String tablePath = tableFile.getParent();
    AcidInfo acidInfo = AcidInfo.builder(new Path(tablePath))
            .addDeleteDelta(new Path(tablePath, deleteDeltaSubdir(10000001, 10000001, 0)))
            .addOriginalFile(new Path(tablePath, "000000_0"), 1780, 0)
            .setOrcAcidVersionValidated(true)
            .buildWithRequiredOriginalFiles(0);
    List<Nation> expected = expectedResult(OptionalLong.empty(), nationKey -> nationKey == 24, 1);
    List<Nation> result = readFile(ALL_COLUMNS, OptionalLong.empty(), Optional.of(acidInfo), tablePath + "/000000_0", 1780);
    assertEquals(result.size(), expected.size());
    int deletedRowKey = 24;
    String deletedRowNameColumn = "UNITED STATES";
    assertFalse(result.stream().anyMatch(acidNationRow -> acidNationRow.getName().equals(deletedRowNameColumn) && acidNationRow.getNationKey() == deletedRowKey), "Deleted row shouldn't be present in the result");
}
Also used : Path(org.apache.hadoop.fs.Path) Nation(io.trino.tpch.Nation) URISyntaxException(java.net.URISyntaxException) Test(org.testng.annotations.Test) NO_ACID_TRANSACTION(io.trino.plugin.hive.acid.AcidTransaction.NO_ACID_TRANSACTION) Preconditions.checkArgument(com.google.common.base.Preconditions.checkArgument) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) Assertions(org.assertj.core.api.Assertions) ConnectorPageSource(io.trino.spi.connector.ConnectorPageSource) LongPredicate(java.util.function.LongPredicate) HiveColumnHandle(io.trino.plugin.hive.HiveColumnHandle) INTEGER(io.trino.spi.type.IntegerType.INTEGER) Assert.assertFalse(org.testng.Assert.assertFalse) SERIALIZATION_LIB(org.apache.hadoop.hive.serde.serdeConstants.SERIALIZATION_LIB) FileFormatDataSourceStats(io.trino.plugin.hive.FileFormatDataSourceStats) ImmutableMap(com.google.common.collect.ImmutableMap) Collections.nCopies(java.util.Collections.nCopies) Domain(io.trino.spi.predicate.Domain) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) Set(java.util.Set) HDFS_ENVIRONMENT(io.trino.plugin.hive.HiveTestUtils.HDFS_ENVIRONMENT) AcidUtils.deleteDeltaSubdir(org.apache.hadoop.hive.ql.io.AcidUtils.deleteDeltaSubdir) REGION_KEY(io.trino.tpch.NationColumn.REGION_KEY) Nation(io.trino.tpch.Nation) NationGenerator(io.trino.tpch.NationGenerator) ReaderPageSource(io.trino.plugin.hive.ReaderPageSource) Resources.getResource(com.google.common.io.Resources.getResource) NATION_KEY(io.trino.tpch.NationColumn.NATION_KEY) List(java.util.List) BIGINT(io.trino.spi.type.BigintType.BIGINT) Optional(java.util.Optional) HivePageSourceFactory(io.trino.plugin.hive.HivePageSourceFactory) NAME(io.trino.tpch.NationColumn.NAME) Type(io.trino.spi.type.Type) Page(io.trino.spi.Page) Assert.assertEquals(org.testng.Assert.assertEquals) OptionalInt(java.util.OptionalInt) ArrayList(java.util.ArrayList) OptionalLong(java.util.OptionalLong) VARCHAR(io.trino.spi.type.VarcharType.VARCHAR) ImmutableList(com.google.common.collect.ImmutableList) HiveColumnHandle.createBaseColumn(io.trino.plugin.hive.HiveColumnHandle.createBaseColumn) COMMENT(io.trino.tpch.NationColumn.COMMENT) NationColumn(io.trino.tpch.NationColumn) HiveType.toHiveType(io.trino.plugin.hive.HiveType.toHiveType) Properties(java.util.Properties) ORC(io.trino.plugin.hive.HiveStorageFormat.ORC) TABLE_IS_TRANSACTIONAL(org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.TABLE_IS_TRANSACTIONAL) TupleDomain(io.trino.spi.predicate.TupleDomain) AcidInfo(io.trino.plugin.hive.AcidInfo) File(java.io.File) JobConf(org.apache.hadoop.mapred.JobConf) SESSION(io.trino.plugin.hive.HiveTestUtils.SESSION) FILE_INPUT_FORMAT(org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.FILE_INPUT_FORMAT) HiveConfig(io.trino.plugin.hive.HiveConfig) REGULAR(io.trino.plugin.hive.HiveColumnHandle.ColumnType.REGULAR) File(java.io.File) AcidInfo(io.trino.plugin.hive.AcidInfo) Test(org.testng.annotations.Test)

Example 5 with AcidInfo

Use of io.trino.plugin.hive.AcidInfo in the trino project by trinodb.

From class TestOrcPageSourceFactory, method testReadWithAcidVersionValidationNoVersionInMetadata:

@Test
public void testReadWithAcidVersionValidationNoVersionInMetadata() throws Exception {
    File tableFile = new File(getResource("acid_version_validation/no_orc_acid_version_in_metadata/00000_0").toURI());
    String tablePath = tableFile.getParent();
    Optional<AcidInfo> acidInfo = AcidInfo.builder(new Path(tablePath)).setOrcAcidVersionValidated(false).build();
    Assertions.assertThatThrownBy(() -> readFile(Map.of(), OptionalLong.empty(), acidInfo, tableFile.getPath(), 730))
            .hasMessageMatching("Hive transactional tables are supported since Hive 3.0. Expected `hive.acid.version` in ORC metadata"
                    + " in .*/acid_version_validation/no_orc_acid_version_in_metadata/00000_0 to be >=2 but was <empty>."
                    + " If you have upgraded from an older version of Hive, make sure a major compaction has been run at least once after the upgrade.");
}
Also used : Path(org.apache.hadoop.fs.Path) File(java.io.File) AcidInfo(io.trino.plugin.hive.AcidInfo) Test(org.testng.annotations.Test)

Aggregations

AcidInfo (io.trino.plugin.hive.AcidInfo)7 Path (org.apache.hadoop.fs.Path)7 Test (org.testng.annotations.Test)5 Preconditions.checkArgument (com.google.common.base.Preconditions.checkArgument)4 ImmutableList (com.google.common.collect.ImmutableList)4 ImmutableList.toImmutableList (com.google.common.collect.ImmutableList.toImmutableList)4 HiveColumnHandle (io.trino.plugin.hive.HiveColumnHandle)4 Domain (io.trino.spi.predicate.Domain)4 TupleDomain (io.trino.spi.predicate.TupleDomain)4 File (java.io.File)4 List (java.util.List)4 Map (java.util.Map)4 Optional (java.util.Optional)4 OptionalInt (java.util.OptionalInt)4 Properties (java.util.Properties)4 ImmutableMap (com.google.common.collect.ImmutableMap)3 FileFormatDataSourceStats (io.trino.plugin.hive.FileFormatDataSourceStats)3 REGULAR (io.trino.plugin.hive.HiveColumnHandle.ColumnType.REGULAR)3 HiveConfig (io.trino.plugin.hive.HiveConfig)3 HivePageSourceFactory (io.trino.plugin.hive.HivePageSourceFactory)3
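
Taken together, the examples show the two ways an AcidInfo is assembled before being handed to the split factory or page source factory. The sketch below condenses them, reusing only builder calls that appear verbatim in Examples 4 and 5; the table location, delta ids, file size, and bucket index are placeholders:

import java.util.Optional;

import io.trino.plugin.hive.AcidInfo;
import org.apache.hadoop.fs.Path;

import static org.apache.hadoop.hive.ql.io.AcidUtils.deleteDeltaSubdir;

public class AcidInfoBuilderSketch {
    public static void main(String[] args) {
        Path tablePath = new Path("/warehouse/acid_table");   // placeholder location

        // Example 5 style: no delete deltas registered; build() returns Optional<AcidInfo>,
        // and the flag records whether the ORC ACID version check (validateOrcAcidVersion
        // in Example 1) still has to run when the file is opened.
        Optional<AcidInfo> pendingValidation = AcidInfo.builder(tablePath)
                .setOrcAcidVersionValidated(false)
                .build();

        // Example 4 style: a table that still contains original (pre-ACID) files; register
        // the delete delta directory and each original file, then build with the required
        // original files for the same bucket index used in Example 4.
        AcidInfo withOriginalFiles = AcidInfo.builder(tablePath)
                .addDeleteDelta(new Path(tablePath, deleteDeltaSubdir(10000001, 10000001, 0)))
                .addOriginalFile(new Path(tablePath, "000000_0"), 1780, 0)
                .setOrcAcidVersionValidated(true)
                .buildWithRequiredOriginalFiles(0);

        System.out.println(pendingValidation.isPresent() + " / " + withOriginalFiles);
    }
}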