Usage examples for io.trino.spi.connector.ConnectorPageSource in project trino by trinodb

Example 1 with ConnectorPageSource

Use of io.trino.spi.connector.ConnectorPageSource in project trino by trinodb.

From class RuleStatsSystemTable, method pageSource:

@Override
public ConnectorPageSource pageSource(ConnectorTransactionHandle transactionHandle, ConnectorSession session, TupleDomain<Integer> constraint) {
    checkState(ruleStatsRecorder.isPresent(), "Rule stats system table can return results only on coordinator");
    Map<Class<?>, RuleStats> ruleStats = ruleStatsRecorder.get().getStats();
    int positionCount = ruleStats.size();
    Map<String, BlockBuilder> blockBuilders = ruleStatsTable.getColumns().stream()
            .collect(toImmutableMap(
                    ColumnMetadata::getName,
                    column -> column.getType().createBlockBuilder(null, positionCount)));
    for (Map.Entry<Class<?>, RuleStats> entry : ruleStats.entrySet()) {
        RuleStats stats = entry.getValue();
        VARCHAR.writeString(blockBuilders.get("rule_name"), entry.getKey().getSimpleName());
        BIGINT.writeLong(blockBuilders.get("invocations"), stats.getInvocations());
        BIGINT.writeLong(blockBuilders.get("matches"), stats.getHits());
        BIGINT.writeLong(blockBuilders.get("failures"), stats.getFailures());
        DOUBLE.writeDouble(blockBuilders.get("average_time"), stats.getTime().getAvg());
        BlockBuilder mapWriter = blockBuilders.get("time_distribution_percentiles").beginBlockEntry();
        for (Map.Entry<Double, Double> percentile : stats.getTime().getPercentiles().entrySet()) {
            DOUBLE.writeDouble(mapWriter, percentile.getKey());
            DOUBLE.writeDouble(mapWriter, percentile.getValue());
        }
        blockBuilders.get("time_distribution_percentiles").closeEntry();
    }
    Block[] blocks = ruleStatsTable.getColumns().stream().map(column -> blockBuilders.get(column.getName()).build()).toArray(Block[]::new);
    return new FixedPageSource(ImmutableList.of(new Page(positionCount, blocks)));
}
Also used : TableMetadataBuilder.tableMetadataBuilder(io.trino.metadata.MetadataUtil.TableMetadataBuilder.tableMetadataBuilder) ColumnMetadata(io.trino.spi.connector.ColumnMetadata) Page(io.trino.spi.Page) ConnectorTableMetadata(io.trino.spi.connector.ConnectorTableMetadata) RuleStatsRecorder(io.trino.sql.planner.RuleStatsRecorder) Inject(javax.inject.Inject) VARCHAR(io.trino.spi.type.VarcharType.VARCHAR) ImmutableList(com.google.common.collect.ImmutableList) Block(io.trino.spi.block.Block) Map(java.util.Map) Objects.requireNonNull(java.util.Objects.requireNonNull) ConnectorPageSource(io.trino.spi.connector.ConnectorPageSource) SINGLE_COORDINATOR(io.trino.spi.connector.SystemTable.Distribution.SINGLE_COORDINATOR) ConnectorSession(io.trino.spi.connector.ConnectorSession) TupleDomain(io.trino.spi.predicate.TupleDomain) SchemaTableName(io.trino.spi.connector.SchemaTableName) Preconditions.checkState(com.google.common.base.Preconditions.checkState) DOUBLE(io.trino.spi.type.DoubleType.DOUBLE) RuleStats(io.trino.sql.planner.iterative.RuleStats) ImmutableMap.toImmutableMap(com.google.common.collect.ImmutableMap.toImmutableMap) FixedPageSource(io.trino.spi.connector.FixedPageSource) BIGINT(io.trino.spi.type.BigintType.BIGINT) Optional(java.util.Optional) BlockBuilder(io.trino.spi.block.BlockBuilder) TypeSignature.mapType(io.trino.spi.type.TypeSignature.mapType) TypeManager(io.trino.spi.type.TypeManager) SystemTable(io.trino.spi.connector.SystemTable) ConnectorTransactionHandle(io.trino.spi.connector.ConnectorTransactionHandle)
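
For context, a minimal sketch (not taken from the Trino sources above) of how a caller typically drains the FixedPageSource returned by pageSource(...); the helper name countRows is invented for illustration and uses only the standard ConnectorPageSource read loop.

// Illustrative only: drain a ConnectorPageSource and count the rows it produces.
// Requires io.trino.spi.Page and io.trino.spi.connector.ConnectorPageSource.
static long countRows(ConnectorPageSource pageSource) {
    long rows = 0;
    while (!pageSource.isFinished()) {
        // getNextPage() may legitimately return null before the source is finished
        Page page = pageSource.getNextPage();
        if (page != null) {
            rows += page.getPositionCount();
        }
    }
    return rows;
}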

Example 2 with ConnectorPageSource

Use of io.trino.spi.connector.ConnectorPageSource in project trino by trinodb.

From class ExtractSpatialJoins, method loadKdbTree:

private static KdbTree loadKdbTree(String tableName, Session session, Metadata metadata, SplitManager splitManager, PageSourceManager pageSourceManager) {
    QualifiedObjectName name = toQualifiedObjectName(tableName, session.getCatalog().get(), session.getSchema().get());
    TableHandle tableHandle = metadata.getTableHandle(session, name).orElseThrow(() -> new TrinoException(INVALID_SPATIAL_PARTITIONING, format("Table not found: %s", name)));
    Map<String, ColumnHandle> columnHandles = metadata.getColumnHandles(session, tableHandle);
    List<ColumnHandle> visibleColumnHandles = columnHandles.values().stream().filter(handle -> !metadata.getColumnMetadata(session, tableHandle, handle).isHidden()).collect(toImmutableList());
    checkSpatialPartitioningTable(visibleColumnHandles.size() == 1, "Expected single column for table %s, but found %s columns", name, columnHandles.size());
    ColumnHandle kdbTreeColumn = Iterables.getOnlyElement(visibleColumnHandles);
    Optional<KdbTree> kdbTree = Optional.empty();
    try (SplitSource splitSource = splitManager.getSplits(session, tableHandle, UNGROUPED_SCHEDULING, EMPTY, alwaysTrue())) {
        while (!Thread.currentThread().isInterrupted()) {
            SplitBatch splitBatch = getFutureValue(splitSource.getNextBatch(NOT_PARTITIONED, Lifespan.taskWide(), 1000));
            List<Split> splits = splitBatch.getSplits();
            for (Split split : splits) {
                try (ConnectorPageSource pageSource = pageSourceManager.createPageSource(session, split, tableHandle, ImmutableList.of(kdbTreeColumn), DynamicFilter.EMPTY)) {
                    do {
                        getFutureValue(pageSource.isBlocked());
                        Page page = pageSource.getNextPage();
                        if (page != null && page.getPositionCount() > 0) {
                            checkSpatialPartitioningTable(kdbTree.isEmpty(), "Expected exactly one row for table %s, but found more", name);
                            checkSpatialPartitioningTable(page.getPositionCount() == 1, "Expected exactly one row for table %s, but found %s rows", name, page.getPositionCount());
                            String kdbTreeJson = VARCHAR.getSlice(page.getBlock(0), 0).toStringUtf8();
                            try {
                                kdbTree = Optional.of(KdbTreeUtils.fromJson(kdbTreeJson));
                            } catch (IllegalArgumentException e) {
                                checkSpatialPartitioningTable(false, "Invalid JSON string for KDB tree: %s", e.getMessage());
                            }
                        }
                    } while (!pageSource.isFinished());
                } catch (IOException e) {
                    throw new UncheckedIOException(e);
                }
            }
            if (splitBatch.isLastBatch()) {
                break;
            }
        }
    }
    checkSpatialPartitioningTable(kdbTree.isPresent(), "Expected exactly one row for table %s, but got none", name);
    return kdbTree.get();
}
Also used : EMPTY(io.trino.spi.connector.DynamicFilter.EMPTY) SpatialJoinUtils.extractSupportedSpatialComparisons(io.trino.util.SpatialJoinUtils.extractSupportedSpatialComparisons) SymbolsExtractor.extractUnique(io.trino.sql.planner.SymbolsExtractor.extractUnique) SplitBatch(io.trino.split.SplitSource.SplitBatch) SplitManager(io.trino.split.SplitManager) SystemSessionProperties.getSpatialPartitioningTableName(io.trino.SystemSessionProperties.getSpatialPartitioningTableName) FilterNode(io.trino.sql.planner.plan.FilterNode) PlanNode(io.trino.sql.planner.plan.PlanNode) LEFT(io.trino.sql.planner.plan.JoinNode.Type.LEFT) PlanNodeId(io.trino.sql.planner.plan.PlanNodeId) Map(java.util.Map) SpatialJoinNode(io.trino.sql.planner.plan.SpatialJoinNode) ConnectorPageSource(io.trino.spi.connector.ConnectorPageSource) JoinNode(io.trino.sql.planner.plan.JoinNode) INTEGER(io.trino.spi.type.IntegerType.INTEGER) Splitter(com.google.common.base.Splitter) FunctionCall(io.trino.sql.tree.FunctionCall) Patterns.join(io.trino.sql.planner.plan.Patterns.join) TypeSignature(io.trino.spi.type.TypeSignature) ImmutableSet(com.google.common.collect.ImmutableSet) ImmutableMap(com.google.common.collect.ImmutableMap) Collection(java.util.Collection) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) TypeSignatureTranslator.toSqlType(io.trino.sql.analyzer.TypeSignatureTranslator.toSqlType) KdbTree(io.trino.geospatial.KdbTree) Assignments(io.trino.sql.planner.plan.Assignments) Set(java.util.Set) TrinoException(io.trino.spi.TrinoException) ArrayType(io.trino.spi.type.ArrayType) SplitSource(io.trino.split.SplitSource) Context(io.trino.sql.planner.iterative.Rule.Context) ComparisonExpression(io.trino.sql.tree.ComparisonExpression) String.format(java.lang.String.format) Constraint.alwaysTrue(io.trino.spi.connector.Constraint.alwaysTrue) LESS_THAN_OR_EQUAL(io.trino.sql.tree.ComparisonExpression.Operator.LESS_THAN_OR_EQUAL) UncheckedIOException(java.io.UncheckedIOException) List(java.util.List) INVALID_SPATIAL_PARTITIONING(io.trino.spi.StandardErrorCode.INVALID_SPATIAL_PARTITIONING) NOT_PARTITIONED(io.trino.spi.connector.NotPartitionedPartitionHandle.NOT_PARTITIONED) Pattern(io.trino.matching.Pattern) SymbolReference(io.trino.sql.tree.SymbolReference) Split(io.trino.metadata.Split) DynamicFilter(io.trino.spi.connector.DynamicFilter) Optional(java.util.Optional) ExpressionNodeInliner.replaceExpression(io.trino.sql.planner.ExpressionNodeInliner.replaceExpression) Expression(io.trino.sql.tree.Expression) Session(io.trino.Session) PlannerContext(io.trino.sql.PlannerContext) Iterables(com.google.common.collect.Iterables) INNER(io.trino.sql.planner.plan.JoinNode.Type.INNER) Type(io.trino.spi.type.Type) Patterns.filter(io.trino.sql.planner.plan.Patterns.filter) Page(io.trino.spi.Page) Capture.newCapture(io.trino.matching.Capture.newCapture) Cast(io.trino.sql.tree.Cast) KdbTreeUtils(io.trino.geospatial.KdbTreeUtils) VARCHAR(io.trino.spi.type.VarcharType.VARCHAR) FunctionCallBuilder(io.trino.sql.planner.FunctionCallBuilder) ImmutableList(com.google.common.collect.ImmutableList) Verify.verify(com.google.common.base.Verify.verify) UNGROUPED_SCHEDULING(io.trino.spi.connector.ConnectorSplitManager.SplitSchedulingStrategy.UNGROUPED_SCHEDULING) Objects.requireNonNull(java.util.Objects.requireNonNull) Result(io.trino.sql.planner.iterative.Rule.Result) ColumnHandle(io.trino.spi.connector.ColumnHandle) Rule(io.trino.sql.planner.iterative.Rule) Lifespan(io.trino.execution.Lifespan) 
ProjectNode(io.trino.sql.planner.plan.ProjectNode) Symbol(io.trino.sql.planner.Symbol) StringLiteral(io.trino.sql.tree.StringLiteral) SystemSessionProperties.isSpatialJoinEnabled(io.trino.SystemSessionProperties.isSpatialJoinEnabled) IOException(java.io.IOException) PageSourceManager(io.trino.split.PageSourceManager) LESS_THAN(io.trino.sql.tree.ComparisonExpression.Operator.LESS_THAN) MoreFutures.getFutureValue(io.airlift.concurrent.MoreFutures.getFutureValue) UnnestNode(io.trino.sql.planner.plan.UnnestNode) Capture(io.trino.matching.Capture) QualifiedName(io.trino.sql.tree.QualifiedName) DOUBLE(io.trino.spi.type.DoubleType.DOUBLE) TableHandle(io.trino.metadata.TableHandle) TypeAnalyzer(io.trino.sql.planner.TypeAnalyzer) QualifiedObjectName(io.trino.metadata.QualifiedObjectName) Patterns.source(io.trino.sql.planner.plan.Patterns.source) Captures(io.trino.matching.Captures) Metadata(io.trino.metadata.Metadata) VisibleForTesting(com.google.common.annotations.VisibleForTesting) TypeManager(io.trino.spi.type.TypeManager) SpatialJoinUtils.extractSupportedSpatialFunctions(io.trino.util.SpatialJoinUtils.extractSupportedSpatialFunctions)
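
The read loop in loadKdbTree above combines three ConnectorPageSource idioms: waiting on isBlocked(), tolerating null pages, and closing the source with try-with-resources (ConnectorPageSource extends Closeable). A condensed, illustrative sketch of the same pattern follows; the helper name readSingleVarchar is invented here, the first column is assumed to be a VARCHAR as in the spatial partitioning table above, and VARCHAR refers to the static import io.trino.spi.type.VarcharType.VARCHAR.

// Illustrative only: read the first VARCHAR cell the source produces, then close it.
static Optional<String> readSingleVarchar(ConnectorPageSource pageSource) throws IOException {
    try (ConnectorPageSource source = pageSource) {
        while (!source.isFinished()) {
            Page page = source.getNextPage();
            if (page != null && page.getPositionCount() > 0) {
                return Optional.of(VARCHAR.getSlice(page.getBlock(0), 0).toStringUtf8());
            }
        }
    }
    return Optional.empty();
}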

Example 3 with ConnectorPageSource

Use of io.trino.spi.connector.ConnectorPageSource in project trino by trinodb.

From class IndexSourceOperator, method addSplit:

@Override
public Supplier<Optional<UpdatablePageSource>> addSplit(Split split) {
    requireNonNull(split, "split is null");
    checkState(source == null, "Index source split already set");
    IndexSplit indexSplit = (IndexSplit) split.getConnectorSplit();
    // Normalize the incoming RecordSet to something that can be consumed by the index
    RecordSet normalizedRecordSet = probeKeyNormalizer.apply(indexSplit.getKeyRecordSet());
    ConnectorPageSource result = index.lookup(normalizedRecordSet);
    source = new PageSourceOperator(result, operatorContext);
    Object splitInfo = split.getInfo();
    if (splitInfo != null) {
        operatorContext.setInfoSupplier(Suppliers.ofInstance(new SplitOperatorInfo(split.getCatalogName(), splitInfo)));
    }
    return Optional::empty;
}
Also used : SplitOperatorInfo(io.trino.operator.SplitOperatorInfo) RecordSet(io.trino.spi.connector.RecordSet) ConnectorPageSource(io.trino.spi.connector.ConnectorPageSource) PageSourceOperator(io.trino.operator.PageSourceOperator)
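
For reference, a small sketch (not part of IndexSourceOperator) of the kind of probe-key RecordSet that index.lookup(...) consumes. ConnectorIndex and InMemoryRecordSet are trino-spi types; the single BIGINT key column and the key values are assumptions made only for this example.

// Illustrative only: build an in-memory probe-key RecordSet and look it up in a ConnectorIndex.
static ConnectorPageSource lookupProbeKeys(ConnectorIndex index) {
    RecordSet probeKeys = InMemoryRecordSet.builder(ImmutableList.of(BIGINT))
            .addRow(42L)
            .addRow(123L)
            .build();
    // lookup(...) answers with a ConnectorPageSource over the rows matching the probe keys
    return index.lookup(probeKeys);
}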

Example 4 with ConnectorPageSource

Use of io.trino.spi.connector.ConnectorPageSource in project trino by trinodb.

From class HivePageSourceProvider, method createPageSource:

@Override
public ConnectorPageSource createPageSource(ConnectorTransactionHandle transaction, ConnectorSession session, ConnectorSplit split, ConnectorTableHandle tableHandle, List<ColumnHandle> columns, DynamicFilter dynamicFilter) {
    HiveTableHandle hiveTable = (HiveTableHandle) tableHandle;
    HiveSplit hiveSplit = (HiveSplit) split;
    if (shouldSkipBucket(hiveTable, hiveSplit, dynamicFilter)) {
        return new EmptyPageSource();
    }
    List<HiveColumnHandle> hiveColumns = columns.stream().map(HiveColumnHandle.class::cast).collect(toList());
    List<HiveColumnHandle> dependencyColumns = hiveColumns.stream().filter(HiveColumnHandle::isBaseColumn).collect(toImmutableList());
    if (hiveTable.isAcidUpdate()) {
        hiveColumns = hiveTable.getUpdateProcessor().orElseThrow(() -> new IllegalArgumentException("update processor not present")).mergeWithNonUpdatedColumns(hiveColumns);
    }
    Path path = new Path(hiveSplit.getPath());
    boolean originalFile = ORIGINAL_FILE_PATH_MATCHER.matcher(path.toString()).matches();
    List<ColumnMapping> columnMappings = ColumnMapping.buildColumnMappings(
            hiveSplit.getPartitionName(),
            hiveSplit.getPartitionKeys(),
            hiveColumns,
            hiveSplit.getBucketConversion().map(BucketConversion::getBucketColumnHandles).orElse(ImmutableList.of()),
            hiveSplit.getTableToPartitionMapping(),
            path,
            hiveSplit.getBucketNumber(),
            hiveSplit.getEstimatedFileSize(),
            hiveSplit.getFileModifiedTime());
    // This can happen when dynamic filters are collected after partition splits were listed.
    if (shouldSkipSplit(columnMappings, dynamicFilter)) {
        return new EmptyPageSource();
    }
    Configuration configuration = hdfsEnvironment.getConfiguration(new HdfsContext(session), path);
    TupleDomain<HiveColumnHandle> simplifiedDynamicFilter = dynamicFilter.getCurrentPredicate().transformKeys(HiveColumnHandle.class::cast).simplify(domainCompactionThreshold);
    Optional<ConnectorPageSource> pageSource = createHivePageSource(
            pageSourceFactories,
            cursorProviders,
            configuration,
            session,
            path,
            hiveSplit.getBucketNumber(),
            hiveSplit.getStart(),
            hiveSplit.getLength(),
            hiveSplit.getEstimatedFileSize(),
            hiveSplit.getSchema(),
            hiveTable.getCompactEffectivePredicate().intersect(simplifiedDynamicFilter),
            hiveColumns,
            typeManager,
            hiveSplit.getBucketConversion(),
            hiveSplit.getBucketValidation(),
            hiveSplit.isS3SelectPushdownEnabled(),
            hiveSplit.getAcidInfo(),
            originalFile,
            hiveTable.getTransaction(),
            columnMappings);
    if (pageSource.isPresent()) {
        ConnectorPageSource source = pageSource.get();
        if (hiveTable.isAcidDelete() || hiveTable.isAcidUpdate()) {
            checkArgument(orcFileWriterFactory.isPresent(), "orcFileWriterFactory not supplied but required for DELETE and UPDATE");
            HivePageSource hivePageSource = (HivePageSource) source;
            OrcPageSource orcPageSource = (OrcPageSource) hivePageSource.getDelegate();
            ColumnMetadata<OrcType> columnMetadata = orcPageSource.getColumnTypes();
            int acidRowColumnId = originalFile ? 0 : ACID_ROW_STRUCT_COLUMN_ID;
            HiveType rowType = fromOrcTypeToHiveType(columnMetadata.get(new OrcColumnId(acidRowColumnId)), columnMetadata);
            long currentSplitNumber = hiveSplit.getSplitNumber();
            if (currentSplitNumber >= MAX_NUMBER_OF_SPLITS) {
                throw new TrinoException(GENERIC_INSUFFICIENT_RESOURCES, format("Number of splits is higher than maximum possible number of splits %d", MAX_NUMBER_OF_SPLITS));
            }
            long initialRowId = currentSplitNumber << PER_SPLIT_ROW_ID_BITS;
            return new HiveUpdatablePageSource(
                    hiveTable,
                    hiveSplit.getPartitionName(),
                    hiveSplit.getStatementId(),
                    source,
                    typeManager,
                    hiveSplit.getBucketNumber(),
                    path,
                    originalFile,
                    orcFileWriterFactory.get(),
                    configuration,
                    session,
                    rowType,
                    dependencyColumns,
                    hiveTable.getTransaction().getOperation(),
                    initialRowId,
                    MAX_NUMBER_OF_ROWS_PER_SPLIT);
        }
        return source;
    }
    throw new RuntimeException("Could not find a file reader for split " + hiveSplit);
}
Also used : OrcColumnId(io.trino.orc.metadata.OrcColumnId) Configuration(org.apache.hadoop.conf.Configuration) ConnectorPageSource(io.trino.spi.connector.ConnectorPageSource) EmptyPageSource(io.trino.spi.connector.EmptyPageSource) HdfsContext(io.trino.plugin.hive.HdfsEnvironment.HdfsContext) Path(org.apache.hadoop.fs.Path) OrcPageSource(io.trino.plugin.hive.orc.OrcPageSource) OrcType(io.trino.orc.metadata.OrcType) TrinoException(io.trino.spi.TrinoException) OrcTypeToHiveTypeTranslator.fromOrcTypeToHiveType(io.trino.plugin.hive.orc.OrcTypeToHiveTypeTranslator.fromOrcTypeToHiveType) BucketConversion(io.trino.plugin.hive.HiveSplit.BucketConversion)
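
The two early returns in createPageSource above follow the same pattern: if the collected dynamic filter already proves the split cannot produce rows, hand back an EmptyPageSource instead of opening the file. A minimal sketch of that idea follows; the helper pageSourceOrEmpty is hypothetical, and checking isNone() on the current predicate is a simplified stand-in for shouldSkipBucket/shouldSkipSplit.

// Illustrative only: skip the split entirely when the dynamic filter is already unsatisfiable.
// Requires java.util.function.Supplier plus the trino-spi DynamicFilter and EmptyPageSource types.
static ConnectorPageSource pageSourceOrEmpty(DynamicFilter dynamicFilter, Supplier<ConnectorPageSource> factory) {
    if (dynamicFilter.getCurrentPredicate().isNone()) {
        // no row can match; avoid opening the underlying file
        return new EmptyPageSource();
    }
    return factory.get();
}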

Example 5 with ConnectorPageSource

Use of io.trino.spi.connector.ConnectorPageSource in project trino by trinodb.

From class OrcPageSourceFactory, method createOrcPageSource:

private ConnectorPageSource createOrcPageSource(
        HdfsEnvironment hdfsEnvironment,
        ConnectorIdentity identity,
        Configuration configuration,
        Path path,
        long start,
        long length,
        long estimatedFileSize,
        List<HiveColumnHandle> columns,
        List<HiveColumnHandle> projections,
        boolean useOrcColumnNames,
        boolean isFullAcid,
        TupleDomain<HiveColumnHandle> effectivePredicate,
        DateTimeZone legacyFileTimeZone,
        OrcReaderOptions options,
        Optional<AcidInfo> acidInfo,
        OptionalInt bucketNumber,
        boolean originalFile,
        AcidTransaction transaction,
        FileFormatDataSourceStats stats) {
    for (HiveColumnHandle column : columns) {
        checkArgument(column.getColumnType() == REGULAR, "column type must be regular: %s", column);
    }
    checkArgument(!effectivePredicate.isNone());
    OrcDataSource orcDataSource;
    boolean originalFilesPresent = acidInfo.isPresent() && !acidInfo.get().getOriginalFiles().isEmpty();
    try {
        FileSystem fileSystem = hdfsEnvironment.getFileSystem(identity, path, configuration);
        FSDataInputStream inputStream = hdfsEnvironment.doAs(identity, () -> fileSystem.open(path));
        orcDataSource = new HdfsOrcDataSource(new OrcDataSourceId(path.toString()), estimatedFileSize, options, inputStream, stats);
    } catch (Exception e) {
        if (nullToEmpty(e.getMessage()).trim().equals("Filesystem closed") || e instanceof FileNotFoundException) {
            throw new TrinoException(HIVE_CANNOT_OPEN_SPLIT, e);
        }
        throw new TrinoException(HIVE_CANNOT_OPEN_SPLIT, splitError(e, path, start, length), e);
    }
    AggregatedMemoryContext memoryUsage = newSimpleAggregatedMemoryContext();
    try {
        Optional<OrcReader> optionalOrcReader = OrcReader.createOrcReader(orcDataSource, options);
        if (optionalOrcReader.isEmpty()) {
            return new EmptyPageSource();
        }
        OrcReader reader = optionalOrcReader.get();
        if (!originalFile && acidInfo.isPresent() && !acidInfo.get().isOrcAcidVersionValidated()) {
            validateOrcAcidVersion(path, reader);
        }
        List<OrcColumn> fileColumns = reader.getRootColumn().getNestedColumns();
        int actualColumnCount = columns.size() + (isFullAcid ? 3 : 0);
        List<OrcColumn> fileReadColumns = new ArrayList<>(actualColumnCount);
        List<Type> fileReadTypes = new ArrayList<>(actualColumnCount);
        List<OrcReader.ProjectedLayout> fileReadLayouts = new ArrayList<>(actualColumnCount);
        if (isFullAcid && !originalFilesPresent) {
            verifyAcidSchema(reader, path);
            Map<String, OrcColumn> acidColumnsByName = uniqueIndex(fileColumns, orcColumn -> orcColumn.getColumnName().toLowerCase(ENGLISH));
            fileColumns = ensureColumnNameConsistency(acidColumnsByName.get(AcidSchema.ACID_COLUMN_ROW_STRUCT.toLowerCase(ENGLISH)).getNestedColumns(), columns);
            fileReadColumns.add(acidColumnsByName.get(AcidSchema.ACID_COLUMN_ORIGINAL_TRANSACTION.toLowerCase(ENGLISH)));
            fileReadTypes.add(BIGINT);
            fileReadLayouts.add(fullyProjectedLayout());
            fileReadColumns.add(acidColumnsByName.get(AcidSchema.ACID_COLUMN_BUCKET.toLowerCase(ENGLISH)));
            fileReadTypes.add(INTEGER);
            fileReadLayouts.add(fullyProjectedLayout());
            fileReadColumns.add(acidColumnsByName.get(AcidSchema.ACID_COLUMN_ROW_ID.toLowerCase(ENGLISH)));
            fileReadTypes.add(BIGINT);
            fileReadLayouts.add(fullyProjectedLayout());
        }
        Map<String, OrcColumn> fileColumnsByName = ImmutableMap.of();
        if (useOrcColumnNames || isFullAcid) {
            verifyFileHasColumnNames(fileColumns, path);
            // Convert column names read from ORC files to lower case to be consistent with those stored in Hive Metastore
            fileColumnsByName = uniqueIndex(fileColumns, orcColumn -> orcColumn.getColumnName().toLowerCase(ENGLISH));
        }
        Map<String, List<List<String>>> projectionsByColumnName = ImmutableMap.of();
        Map<Integer, List<List<String>>> projectionsByColumnIndex = ImmutableMap.of();
        if (useOrcColumnNames || isFullAcid) {
            projectionsByColumnName = projections.stream().collect(Collectors.groupingBy(HiveColumnHandle::getBaseColumnName, mapping(OrcPageSourceFactory::getDereferencesAsList, toList())));
        } else {
            projectionsByColumnIndex = projections.stream().collect(Collectors.groupingBy(HiveColumnHandle::getBaseHiveColumnIndex, mapping(OrcPageSourceFactory::getDereferencesAsList, toList())));
        }
        TupleDomainOrcPredicateBuilder predicateBuilder = TupleDomainOrcPredicate.builder().setBloomFiltersEnabled(options.isBloomFiltersEnabled()).setDomainCompactionThreshold(domainCompactionThreshold);
        Map<HiveColumnHandle, Domain> effectivePredicateDomains = effectivePredicate.getDomains().orElseThrow(() -> new IllegalArgumentException("Effective predicate is none"));
        List<ColumnAdaptation> columnAdaptations = new ArrayList<>(columns.size());
        for (HiveColumnHandle column : columns) {
            OrcColumn orcColumn = null;
            OrcReader.ProjectedLayout projectedLayout = null;
            Map<Optional<HiveColumnProjectionInfo>, Domain> columnDomains = null;
            if (useOrcColumnNames || isFullAcid) {
                String columnName = column.getName().toLowerCase(ENGLISH);
                orcColumn = fileColumnsByName.get(columnName);
                if (orcColumn != null) {
                    projectedLayout = createProjectedLayout(orcColumn, projectionsByColumnName.get(columnName));
                    columnDomains = effectivePredicateDomains.entrySet().stream().filter(columnDomain -> columnDomain.getKey().getBaseColumnName().toLowerCase(ENGLISH).equals(columnName)).collect(toImmutableMap(columnDomain -> columnDomain.getKey().getHiveColumnProjectionInfo(), Map.Entry::getValue));
                }
            } else if (column.getBaseHiveColumnIndex() < fileColumns.size()) {
                orcColumn = fileColumns.get(column.getBaseHiveColumnIndex());
                if (orcColumn != null) {
                    projectedLayout = createProjectedLayout(orcColumn, projectionsByColumnIndex.get(column.getBaseHiveColumnIndex()));
                    columnDomains = effectivePredicateDomains.entrySet().stream().filter(columnDomain -> columnDomain.getKey().getBaseHiveColumnIndex() == column.getBaseHiveColumnIndex()).collect(toImmutableMap(columnDomain -> columnDomain.getKey().getHiveColumnProjectionInfo(), Map.Entry::getValue));
                }
            }
            Type readType = column.getType();
            if (orcColumn != null) {
                int sourceIndex = fileReadColumns.size();
                columnAdaptations.add(ColumnAdaptation.sourceColumn(sourceIndex));
                fileReadColumns.add(orcColumn);
                fileReadTypes.add(readType);
                fileReadLayouts.add(projectedLayout);
                // Add predicates on top-level and nested columns
                for (Map.Entry<Optional<HiveColumnProjectionInfo>, Domain> columnDomain : columnDomains.entrySet()) {
                    OrcColumn nestedColumn = getNestedColumn(orcColumn, columnDomain.getKey());
                    if (nestedColumn != null) {
                        predicateBuilder.addColumn(nestedColumn.getColumnId(), columnDomain.getValue());
                    }
                }
            } else {
                columnAdaptations.add(ColumnAdaptation.nullColumn(readType));
            }
        }
        OrcRecordReader recordReader = reader.createRecordReader(
                fileReadColumns,
                fileReadTypes,
                fileReadLayouts,
                predicateBuilder.build(),
                start,
                length,
                legacyFileTimeZone,
                memoryUsage,
                INITIAL_BATCH_SIZE,
                exception -> handleException(orcDataSource.getId(), exception),
                NameBasedFieldMapper::create);
        Optional<OrcDeletedRows> deletedRows = acidInfo.map(info -> new OrcDeletedRows(
                path.getName(),
                new OrcDeleteDeltaPageSourceFactory(options, identity, configuration, hdfsEnvironment, stats),
                identity,
                configuration,
                hdfsEnvironment,
                info,
                bucketNumber,
                memoryUsage));
        Optional<Long> originalFileRowId = acidInfo
                .filter(OrcPageSourceFactory::hasOriginalFiles)
                .map(info -> OriginalFilesUtils.getPrecedingRowCount(
                        acidInfo.get().getOriginalFiles(),
                        path,
                        hdfsEnvironment,
                        identity,
                        options,
                        configuration,
                        stats));
        if (transaction.isDelete()) {
            if (originalFile) {
                int bucket = bucketNumber.orElse(0);
                long startingRowId = originalFileRowId.orElse(0L);
                columnAdaptations.add(ColumnAdaptation.originalFileRowIdColumn(startingRowId, bucket));
            } else {
                columnAdaptations.add(ColumnAdaptation.rowIdColumn());
            }
        } else if (transaction.isUpdate()) {
            HiveUpdateProcessor updateProcessor = transaction.getUpdateProcessor().orElseThrow(() -> new IllegalArgumentException("updateProcessor not present"));
            List<HiveColumnHandle> dependencyColumns = projections.stream().filter(HiveColumnHandle::isBaseColumn).collect(toImmutableList());
            if (originalFile) {
                int bucket = bucketNumber.orElse(0);
                long startingRowId = originalFileRowId.orElse(0L);
                columnAdaptations.add(updatedRowColumnsWithOriginalFiles(startingRowId, bucket, updateProcessor, dependencyColumns));
            } else {
                columnAdaptations.add(updatedRowColumns(updateProcessor, dependencyColumns));
            }
        }
        return new OrcPageSource(recordReader, columnAdaptations, orcDataSource, deletedRows, originalFileRowId, memoryUsage, stats);
    } catch (Exception e) {
        try {
            orcDataSource.close();
        } catch (IOException ignored) {
        }
        if (e instanceof TrinoException) {
            throw (TrinoException) e;
        }
        String message = splitError(e, path, start, length);
        if (e instanceof BlockMissingException) {
            throw new TrinoException(HIVE_MISSING_DATA, message, e);
        }
        throw new TrinoException(HIVE_CANNOT_OPEN_SPLIT, message, e);
    }
}
Also used : DateTimeZone(org.joda.time.DateTimeZone) HiveUpdateProcessor(io.trino.plugin.hive.HiveUpdateProcessor) FileSystem(org.apache.hadoop.fs.FileSystem) HIVE_CANNOT_OPEN_SPLIT(io.trino.plugin.hive.HiveErrorCode.HIVE_CANNOT_OPEN_SPLIT) OrcTypeKind(io.trino.orc.metadata.OrcType.OrcTypeKind) Maps.uniqueIndex(com.google.common.collect.Maps.uniqueIndex) ColumnAdaptation(io.trino.plugin.hive.orc.OrcPageSource.ColumnAdaptation) BlockMissingException(org.apache.hadoop.hdfs.BlockMissingException) NOT_SUPPORTED(io.trino.spi.StandardErrorCode.NOT_SUPPORTED) HiveSessionProperties.getOrcLazyReadSmallRanges(io.trino.plugin.hive.HiveSessionProperties.getOrcLazyReadSmallRanges) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) ConnectorPageSource(io.trino.spi.connector.ConnectorPageSource) ENGLISH(java.util.Locale.ENGLISH) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream) AcidTransaction(io.trino.plugin.hive.acid.AcidTransaction) HiveSessionProperties.getOrcTinyStripeThreshold(io.trino.plugin.hive.HiveSessionProperties.getOrcTinyStripeThreshold) FileFormatDataSourceStats(io.trino.plugin.hive.FileFormatDataSourceStats) HdfsEnvironment(io.trino.plugin.hive.HdfsEnvironment) ConnectorIdentity(io.trino.spi.security.ConnectorIdentity) Domain(io.trino.spi.predicate.Domain) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) ReaderColumns(io.trino.plugin.hive.ReaderColumns) ReaderPageSource(io.trino.plugin.hive.ReaderPageSource) ImmutableMap.toImmutableMap(com.google.common.collect.ImmutableMap.toImmutableMap) Slice(io.airlift.slice.Slice) STRUCT(io.trino.orc.metadata.OrcType.OrcTypeKind.STRUCT) ColumnAdaptation.updatedRowColumns(io.trino.plugin.hive.orc.OrcPageSource.ColumnAdaptation.updatedRowColumns) ArrayList(java.util.ArrayList) Collectors.toUnmodifiableList(java.util.stream.Collectors.toUnmodifiableList) TupleDomainOrcPredicateBuilder(io.trino.orc.TupleDomainOrcPredicate.TupleDomainOrcPredicateBuilder) LONG(io.trino.orc.metadata.OrcType.OrcTypeKind.LONG) AggregatedMemoryContext(io.trino.memory.context.AggregatedMemoryContext) INT(io.trino.orc.metadata.OrcType.OrcTypeKind.INT) HIVE_BAD_DATA(io.trino.plugin.hive.HiveErrorCode.HIVE_BAD_DATA) Properties(java.util.Properties) IOException(java.io.IOException) ConnectorSession(io.trino.spi.connector.ConnectorSession) NameBasedFieldMapper(io.trino.orc.NameBasedFieldMapper) HivePageSourceProvider.projectBaseColumns(io.trino.plugin.hive.HivePageSourceProvider.projectBaseColumns) HiveSessionProperties.isOrcNestedLazy(io.trino.plugin.hive.HiveSessionProperties.isOrcNestedLazy) OrcColumn(io.trino.orc.OrcColumn) HIVE_MISSING_DATA(io.trino.plugin.hive.HiveErrorCode.HIVE_MISSING_DATA) OrcSerde(org.apache.hadoop.hive.ql.io.orc.OrcSerde) Preconditions.checkArgument(com.google.common.base.Preconditions.checkArgument) OrcRecordReader(io.trino.orc.OrcRecordReader) Path(org.apache.hadoop.fs.Path) OrcDataSource(io.trino.orc.OrcDataSource) HiveColumnHandle(io.trino.plugin.hive.HiveColumnHandle) INTEGER(io.trino.spi.type.IntegerType.INTEGER) ImmutableMap(com.google.common.collect.ImmutableMap) AcidUtils.isFullAcidTable(org.apache.hadoop.hive.ql.io.AcidUtils.isFullAcidTable) INITIAL_BATCH_SIZE(io.trino.orc.OrcReader.INITIAL_BATCH_SIZE) OrcPageSource.handleException(io.trino.plugin.hive.orc.OrcPageSource.handleException) TrinoException(io.trino.spi.TrinoException) Collectors(java.util.stream.Collectors) FileNotFoundException(java.io.FileNotFoundException) String.format(java.lang.String.format) 
OrcDataSourceId(io.trino.orc.OrcDataSourceId) List(java.util.List) BIGINT(io.trino.spi.type.BigintType.BIGINT) Function.identity(java.util.function.Function.identity) Optional(java.util.Optional) PRESTO_WRITER_ID(io.trino.orc.metadata.OrcMetadataWriter.PRESTO_WRITER_ID) HivePageSourceFactory(io.trino.plugin.hive.HivePageSourceFactory) Pattern(java.util.regex.Pattern) TRINO_WRITER_ID(io.trino.orc.metadata.OrcMetadataWriter.TRINO_WRITER_ID) Strings.nullToEmpty(com.google.common.base.Strings.nullToEmpty) AggregatedMemoryContext.newSimpleAggregatedMemoryContext(io.trino.memory.context.AggregatedMemoryContext.newSimpleAggregatedMemoryContext) HiveUtil.isDeserializerClass(io.trino.plugin.hive.util.HiveUtil.isDeserializerClass) Type(io.trino.spi.type.Type) TupleDomainOrcPredicate(io.trino.orc.TupleDomainOrcPredicate) AcidSchema(io.trino.plugin.hive.acid.AcidSchema) HiveSessionProperties.isUseOrcColumnNames(io.trino.plugin.hive.HiveSessionProperties.isUseOrcColumnNames) OptionalInt(java.util.OptionalInt) Inject(javax.inject.Inject) HiveSessionProperties.getOrcStreamBufferSize(io.trino.plugin.hive.HiveSessionProperties.getOrcStreamBufferSize) ImmutableList(com.google.common.collect.ImmutableList) OrcReaderOptions(io.trino.orc.OrcReaderOptions) Objects.requireNonNull(java.util.Objects.requireNonNull) Collectors.mapping(java.util.stream.Collectors.mapping) HiveSessionProperties.isOrcBloomFiltersEnabled(io.trino.plugin.hive.HiveSessionProperties.isOrcBloomFiltersEnabled) HiveSessionProperties.getOrcMaxReadBlockSize(io.trino.plugin.hive.HiveSessionProperties.getOrcMaxReadBlockSize) OrcReader(io.trino.orc.OrcReader) HiveSessionProperties.getOrcMaxBufferSize(io.trino.plugin.hive.HiveSessionProperties.getOrcMaxBufferSize) NameBasedProjectedLayout.createProjectedLayout(io.trino.orc.OrcReader.NameBasedProjectedLayout.createProjectedLayout) UTF_8(java.nio.charset.StandardCharsets.UTF_8) TupleDomain(io.trino.spi.predicate.TupleDomain) OrcReader.fullyProjectedLayout(io.trino.orc.OrcReader.fullyProjectedLayout) Maps(com.google.common.collect.Maps) HiveSessionProperties.getOrcMaxMergeDistance(io.trino.plugin.hive.HiveSessionProperties.getOrcMaxMergeDistance) ColumnAdaptation.updatedRowColumnsWithOriginalFiles(io.trino.plugin.hive.orc.OrcPageSource.ColumnAdaptation.updatedRowColumnsWithOriginalFiles) AcidInfo(io.trino.plugin.hive.AcidInfo) HiveColumnProjectionInfo(io.trino.plugin.hive.HiveColumnProjectionInfo) Collectors.toList(java.util.stream.Collectors.toList) EmptyPageSource(io.trino.spi.connector.EmptyPageSource) HIVE_FILE_MISSING_COLUMN_NAMES(io.trino.plugin.hive.HiveErrorCode.HIVE_FILE_MISSING_COLUMN_NAMES) HiveConfig(io.trino.plugin.hive.HiveConfig) REGULAR(io.trino.plugin.hive.HiveColumnHandle.ColumnType.REGULAR)
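
As a footnote to the predicate wiring in createOrcPageSource above, here is a minimal, illustrative sketch of assembling a TupleDomainOrcPredicate before createRecordReader(...). The builder calls mirror predicateBuilder above; the column id and domain value are invented for the example.

// Illustrative only: one Domain registered against an ORC column id.
TupleDomainOrcPredicate predicate = TupleDomainOrcPredicate.builder()
        .setBloomFiltersEnabled(true)
        .addColumn(new OrcColumnId(3), Domain.singleValue(BIGINT, 42L))
        .build();
// the built predicate is what reader.createRecordReader(...) receives in place of predicateBuilder.build()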

Aggregations

Classes most frequently used together with ConnectorPageSource in the indexed examples, with usage counts:

ConnectorPageSource (io.trino.spi.connector.ConnectorPageSource): 50
ConnectorSession (io.trino.spi.connector.ConnectorSession): 23
Page (io.trino.spi.Page): 18
Type (io.trino.spi.type.Type): 18
Test (org.testng.annotations.Test): 17
ImmutableList (com.google.common.collect.ImmutableList): 16
MaterializedResult (io.trino.testing.MaterializedResult): 14
ImmutableList.toImmutableList (com.google.common.collect.ImmutableList.toImmutableList): 13
ColumnHandle (io.trino.spi.connector.ColumnHandle): 13
ConnectorTableHandle (io.trino.spi.connector.ConnectorTableHandle): 13
List (java.util.List): 12
Optional (java.util.Optional): 12
ConnectorSplit (io.trino.spi.connector.ConnectorSplit): 11
ImmutableMap (com.google.common.collect.ImmutableMap): 10
TestingConnectorSession (io.trino.testing.TestingConnectorSession): 10
File (java.io.File): 10
Path (org.apache.hadoop.fs.Path): 10
TupleDomain (io.trino.spi.predicate.TupleDomain): 9
IOException (java.io.IOException): 9
ArrayList (java.util.ArrayList): 9