Example 6 with EmptyPageSource

Use of io.trino.spi.connector.EmptyPageSource in project trino by trinodb.

From class TableScanOperator, method addSplit:

@Override
public Supplier<Optional<UpdatablePageSource>> addSplit(Split split) {
    requireNonNull(split, "split is null");
    checkState(this.split == null, "Table scan split already set");
    if (finished) {
        // The operator has already finished, so a late-arriving split is ignored.
        return Optional::empty;
    }
    this.split = split;
    Object splitInfo = split.getInfo();
    if (splitInfo != null) {
        operatorContext.setInfoSupplier(Suppliers.ofInstance(new SplitOperatorInfo(split.getCatalogName(), splitInfo)));
    }
    // A split is now available, so unblock the operator.
    blocked.set(null);
    if (split.getConnectorSplit() instanceof EmptySplit) {
        // An EmptySplit carries no data; short-circuit with an EmptyPageSource
        // instead of asking the connector for a real page source.
        source = new EmptyPageSource();
    }
    return () -> {
        if (source instanceof UpdatablePageSource) {
            return Optional.of((UpdatablePageSource) source);
        }
        return Optional.empty();
    };
}
Also used: UpdatablePageSource (io.trino.spi.connector.UpdatablePageSource), EmptyPageSource (io.trino.spi.connector.EmptyPageSource), EmptySplit (io.trino.split.EmptySplit)
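
For context, here is a hedged sketch of what a read loop sees when the split was empty. It assumes EmptyPageSource follows the usual ConnectorPageSource contract (isFinished() is true from the start and getNextPage() returns null); the drainPages helper and the demo class are hypothetical, not part of Trino.

import io.trino.spi.Page;
import io.trino.spi.connector.ConnectorPageSource;
import io.trino.spi.connector.EmptyPageSource;

public final class EmptyPageSourceDemo {
    // Hypothetical helper: drain a page source and count the pages it yields.
    static int drainPages(ConnectorPageSource source) {
        int pages = 0;
        while (!source.isFinished()) {
            Page page = source.getNextPage();
            if (page != null) {
                pages++;
            }
        }
        return pages;
    }

    public static void main(String[] args) {
        // An EmptyPageSource is finished from the start, so this prints 0.
        System.out.println(drainPages(new EmptyPageSource()));
    }
}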

Example 7 with EmptyPageSource

Use of io.trino.spi.connector.EmptyPageSource in project trino by trinodb.

From class DeltaLakePageSourceProvider, method createPageSource:

@Override
public ConnectorPageSource createPageSource(ConnectorTransactionHandle transaction, ConnectorSession session, ConnectorSplit connectorSplit, ConnectorTableHandle connectorTable, List<ColumnHandle> columns, DynamicFilter dynamicFilter) {
    DeltaLakeSplit split = (DeltaLakeSplit) connectorSplit;
    DeltaLakeTableHandle table = (DeltaLakeTableHandle) connectorTable;
    // We reach here when we could not prune the split using file level stats, table predicate
    // and the dynamic filter in the coordinator during split generation. The file level stats
    // in DeltaLakeSplit#filePredicate could help to prune this split when a more selective dynamic filter
    // is available now, without having to access parquet file footer for row-group stats.
    // We avoid sending DeltaLakeSplit#splitPredicate to workers by using table.getPredicate() here.
    TupleDomain<DeltaLakeColumnHandle> filteredSplitPredicate = TupleDomain.intersect(ImmutableList.of(table.getNonPartitionConstraint(), split.getStatisticsPredicate(), dynamicFilter.getCurrentPredicate().transformKeys(DeltaLakeColumnHandle.class::cast)));
    if (filteredSplitPredicate.isNone()) {
        // The combined predicate is unsatisfiable, so no row in this split can match.
        return new EmptyPageSource();
    }
    List<DeltaLakeColumnHandle> deltaLakeColumns = columns.stream().map(DeltaLakeColumnHandle.class::cast).collect(toImmutableList());
    Map<String, Optional<String>> partitionKeys = split.getPartitionKeys();
    List<DeltaLakeColumnHandle> regularColumns = deltaLakeColumns.stream().filter(column -> column.getColumnType() == REGULAR).collect(toImmutableList());
    List<HiveColumnHandle> hiveColumnHandles = regularColumns.stream().map(DeltaLakeColumnHandle::toHiveColumnHandle).collect(toImmutableList());
    Path path = new Path(split.getPath());
    HdfsContext hdfsContext = new HdfsContext(session);
    TupleDomain<HiveColumnHandle> parquetPredicate = getParquetTupleDomain(filteredSplitPredicate.simplify(domainCompactionThreshold));
    if (table.getWriteType().isPresent()) {
        return new DeltaLakeUpdatablePageSource(table, deltaLakeColumns, partitionKeys, split.getPath(), split.getFileSize(), split.getFileModifiedTime(), session, executorService, hdfsEnvironment, hdfsContext, parquetDateTimeZone, parquetReaderOptions, parquetPredicate, typeManager, updateResultJsonCodec);
    }
    ReaderPageSource pageSource = ParquetPageSourceFactory.createPageSource(path, split.getStart(), split.getLength(), split.getFileSize(), hiveColumnHandles, parquetPredicate, true, hdfsEnvironment, hdfsEnvironment.getConfiguration(hdfsContext, path), session.getIdentity(), parquetDateTimeZone, fileFormatDataSourceStats, parquetReaderOptions.withMaxReadBlockSize(getParquetMaxReadBlockSize(session)).withUseColumnIndex(isParquetUseColumnIndex(session)));
    verify(pageSource.getReaderColumns().isEmpty(), "All columns expected to be base columns");
    return new DeltaLakePageSource(deltaLakeColumns, partitionKeys, pageSource.get(), split.getPath(), split.getFileSize(), split.getFileModifiedTime());
}
Also used: DateTimeZone (org.joda.time.DateTimeZone), HiveSessionProperties.isParquetUseColumnIndex (io.trino.plugin.hive.HiveSessionProperties.isParquetUseColumnIndex), Inject (javax.inject.Inject), ParquetPageSourceFactory (io.trino.plugin.hive.parquet.ParquetPageSourceFactory), ImmutableList (com.google.common.collect.ImmutableList), Verify.verify (com.google.common.base.Verify.verify), ConnectorTableHandle (io.trino.spi.connector.ConnectorTableHandle), Map (java.util.Map), Objects.requireNonNull (java.util.Objects.requireNonNull), ColumnHandle (io.trino.spi.connector.ColumnHandle), Path (org.apache.hadoop.fs.Path), ConnectorPageSource (io.trino.spi.connector.ConnectorPageSource), HiveColumnHandle (io.trino.plugin.hive.HiveColumnHandle), ExecutorService (java.util.concurrent.ExecutorService), ParquetReaderOptions (io.trino.parquet.ParquetReaderOptions), FileFormatDataSourceStats (io.trino.plugin.hive.FileFormatDataSourceStats), HdfsEnvironment (io.trino.plugin.hive.HdfsEnvironment), ImmutableMap (com.google.common.collect.ImmutableMap), ConnectorSplit (io.trino.spi.connector.ConnectorSplit), Domain (io.trino.spi.predicate.Domain), ImmutableList.toImmutableList (com.google.common.collect.ImmutableList.toImmutableList), ConnectorPageSourceProvider (io.trino.spi.connector.ConnectorPageSourceProvider), StandardTypes (io.trino.spi.type.StandardTypes), ConnectorSession (io.trino.spi.connector.ConnectorSession), TupleDomain (io.trino.spi.predicate.TupleDomain), ReaderPageSource (io.trino.plugin.hive.ReaderPageSource), HdfsContext (io.trino.plugin.hive.HdfsEnvironment.HdfsContext), List (java.util.List), DeltaLakeSessionProperties.getParquetMaxReadBlockSize (io.trino.plugin.deltalake.DeltaLakeSessionProperties.getParquetMaxReadBlockSize), DynamicFilter (io.trino.spi.connector.DynamicFilter), Optional (java.util.Optional), ParquetReaderConfig (io.trino.plugin.hive.parquet.ParquetReaderConfig), EmptyPageSource (io.trino.spi.connector.EmptyPageSource), TypeManager (io.trino.spi.type.TypeManager), HiveConfig (io.trino.plugin.hive.HiveConfig), REGULAR (io.trino.plugin.deltalake.DeltaLakeColumnType.REGULAR), JsonCodec (io.airlift.json.JsonCodec), ConnectorTransactionHandle (io.trino.spi.connector.ConnectorTransactionHandle)
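
The early return above hinges on TupleDomain semantics: an intersection that includes TupleDomain.none() is itself none. A minimal sketch of that short circuit, assuming the static TupleDomain.intersect(List) overload used above; the Column record is a hypothetical stand-in for DeltaLakeColumnHandle.

import com.google.common.collect.ImmutableList;
import io.trino.spi.connector.ConnectorPageSource;
import io.trino.spi.connector.EmptyPageSource;
import io.trino.spi.predicate.TupleDomain;

public final class PredicateShortCircuitDemo {
    // Hypothetical column handle; any type parameter works for TupleDomain<T>.
    record Column(String name) {}

    public static void main(String[] args) {
        TupleDomain<Column> tablePredicate = TupleDomain.all();
        // A dynamic filter that has become unsatisfiable is TupleDomain.none().
        TupleDomain<Column> dynamicFilter = TupleDomain.none();
        TupleDomain<Column> combined = TupleDomain.intersect(ImmutableList.of(tablePredicate, dynamicFilter));
        if (combined.isNone()) {
            // Mirrors the early return above: no row can match, so hand back
            // an EmptyPageSource instead of opening the Parquet file.
            ConnectorPageSource source = new EmptyPageSource();
            System.out.println("short-circuited, finished=" + source.isFinished());
        }
    }
}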

Example 8 with EmptyPageSource

Use of io.trino.spi.connector.EmptyPageSource in project trino by trinodb.

From class HivePageSourceProvider, method createHivePageSource:

public static Optional<ConnectorPageSource> createHivePageSource(Set<HivePageSourceFactory> pageSourceFactories, Set<HiveRecordCursorProvider> cursorProviders, Configuration configuration, ConnectorSession session, Path path, OptionalInt bucketNumber, long start, long length, long estimatedFileSize, Properties schema, TupleDomain<HiveColumnHandle> effectivePredicate, List<HiveColumnHandle> columns, TypeManager typeManager, Optional<BucketConversion> bucketConversion, Optional<BucketValidation> bucketValidation, boolean s3SelectPushdownEnabled, Optional<AcidInfo> acidInfo, boolean originalFile, AcidTransaction transaction, List<ColumnMapping> columnMappings) {
    if (effectivePredicate.isNone()) {
        // The effective predicate can never match any row, so skip the file entirely.
        return Optional.of(new EmptyPageSource());
    }
    List<ColumnMapping> regularAndInterimColumnMappings = ColumnMapping.extractRegularAndInterimColumnMappings(columnMappings);
    Optional<BucketAdaptation> bucketAdaptation = createBucketAdaptation(bucketConversion, bucketNumber, regularAndInterimColumnMappings);
    Optional<BucketValidator> bucketValidator = createBucketValidator(path, bucketValidation, bucketNumber, regularAndInterimColumnMappings);
    for (HivePageSourceFactory pageSourceFactory : pageSourceFactories) {
        List<HiveColumnHandle> desiredColumns = toColumnHandles(regularAndInterimColumnMappings, true, typeManager);
        Optional<ReaderPageSource> readerWithProjections = pageSourceFactory.createPageSource(configuration, session, path, start, length, estimatedFileSize, schema, desiredColumns, effectivePredicate, acidInfo, bucketNumber, originalFile, transaction);
        if (readerWithProjections.isPresent()) {
            ConnectorPageSource pageSource = readerWithProjections.get().get();
            Optional<ReaderColumns> readerProjections = readerWithProjections.get().getReaderColumns();
            Optional<ReaderProjectionsAdapter> adapter = Optional.empty();
            if (readerProjections.isPresent()) {
                adapter = Optional.of(hiveProjectionsAdapter(desiredColumns, readerProjections.get()));
            }
            return Optional.of(new HivePageSource(columnMappings, bucketAdaptation, bucketValidator, adapter, typeManager, pageSource));
        }
    }
    for (HiveRecordCursorProvider provider : cursorProviders) {
        // GenericHiveRecordCursor will automatically do the coercion without HiveCoercionRecordCursor
        boolean doCoercion = !(provider instanceof GenericHiveRecordCursorProvider);
        List<HiveColumnHandle> desiredColumns = toColumnHandles(regularAndInterimColumnMappings, doCoercion, typeManager);
        Optional<ReaderRecordCursorWithProjections> readerWithProjections = provider.createRecordCursor(configuration, session, path, start, length, estimatedFileSize, schema, desiredColumns, effectivePredicate, typeManager, s3SelectPushdownEnabled);
        if (readerWithProjections.isPresent()) {
            RecordCursor delegate = readerWithProjections.get().getRecordCursor();
            Optional<ReaderColumns> projections = readerWithProjections.get().getProjectedReaderColumns();
            if (projections.isPresent()) {
                ReaderProjectionsAdapter projectionsAdapter = hiveProjectionsAdapter(desiredColumns, projections.get());
                delegate = new HiveReaderProjectionsAdaptingRecordCursor(delegate, projectionsAdapter);
            }
            checkArgument(acidInfo.isEmpty(), "Acid is not supported");
            if (bucketAdaptation.isPresent()) {
                delegate = new HiveBucketAdapterRecordCursor(bucketAdaptation.get().getBucketColumnIndices(), bucketAdaptation.get().getBucketColumnHiveTypes(), bucketAdaptation.get().getBucketingVersion(), bucketAdaptation.get().getTableBucketCount(), bucketAdaptation.get().getPartitionBucketCount(), bucketAdaptation.get().getBucketToKeep(), typeManager, delegate);
            }
            // Need to wrap RcText and RcBinary into a wrapper, which will do the coercion for mismatch columns
            if (doCoercion) {
                delegate = new HiveCoercionRecordCursor(regularAndInterimColumnMappings, typeManager, delegate);
            }
            // bucket adaptation already validates that data is in the right bucket
            if (bucketAdaptation.isEmpty() && bucketValidator.isPresent()) {
                delegate = bucketValidator.get().wrapRecordCursor(delegate, typeManager);
            }
            HiveRecordCursor hiveRecordCursor = new HiveRecordCursor(columnMappings, delegate);
            List<Type> columnTypes = columns.stream().map(HiveColumnHandle::getType).collect(toList());
            return Optional.of(new RecordPageSource(columnTypes, hiveRecordCursor));
        }
    }
    return Optional.empty();
}
Also used: BucketValidator (io.trino.plugin.hive.HivePageSource.BucketValidator), RecordCursor (io.trino.spi.connector.RecordCursor), ConnectorPageSource (io.trino.spi.connector.ConnectorPageSource), RecordPageSource (io.trino.spi.connector.RecordPageSource), EmptyPageSource (io.trino.spi.connector.EmptyPageSource), Type (io.trino.spi.type.Type), OrcTypeToHiveTypeTranslator.fromOrcTypeToHiveType (io.trino.plugin.hive.orc.OrcTypeToHiveTypeTranslator.fromOrcTypeToHiveType), OrcType (io.trino.orc.metadata.OrcType), ReaderRecordCursorWithProjections (io.trino.plugin.hive.HiveRecordCursorProvider.ReaderRecordCursorWithProjections)
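
createHivePageSource is a two-tier dispatch: every HivePageSourceFactory gets the first chance to claim the file, and only when none does do the record-cursor providers run. A hedged sketch of that first-match pattern in isolation; the Factory interface and all names below are hypothetical, not Trino's API.

import java.util.List;
import java.util.Optional;

public final class FirstMatchDispatchDemo {
    // Hypothetical provider: returns a result only if it can handle the input.
    interface Factory<T> {
        Optional<T> tryCreate(String format);
    }

    // Ask each factory in order; the first non-empty answer wins.
    static <T> Optional<T> firstMatch(List<Factory<T>> factories, String format) {
        for (Factory<T> factory : factories) {
            Optional<T> result = factory.tryCreate(format);
            if (result.isPresent()) {
                return result;
            }
        }
        // No factory claimed the input; the caller picks the fallback, just as
        // createHivePageSource falls through to the cursor providers.
        return Optional.empty();
    }

    public static void main(String[] args) {
        List<Factory<String>> factories = List.of(
                fmt -> fmt.equals("parquet") ? Optional.of("parquet reader") : Optional.empty(),
                fmt -> fmt.equals("orc") ? Optional.of("orc reader") : Optional.empty());
        System.out.println(firstMatch(factories, "orc").orElse("no reader"));
    }
}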

Aggregations

EmptyPageSource (io.trino.spi.connector.EmptyPageSource): 8
ConnectorPageSource (io.trino.spi.connector.ConnectorPageSource): 5
TrinoException (io.trino.spi.TrinoException): 4
ImmutableList (com.google.common.collect.ImmutableList): 3
ImmutableList.toImmutableList (com.google.common.collect.ImmutableList.toImmutableList): 3
ImmutableMap (com.google.common.collect.ImmutableMap): 3
HiveColumnHandle (io.trino.plugin.hive.HiveColumnHandle): 3
ReaderPageSource (io.trino.plugin.hive.ReaderPageSource): 3
ColumnHandle (io.trino.spi.connector.ColumnHandle): 3
Type (io.trino.spi.type.Type): 3
ReaderColumns (io.trino.plugin.hive.ReaderColumns): 2
FileNotFoundException (java.io.FileNotFoundException): 2
IOException (java.io.IOException): 2
Configuration (org.apache.hadoop.conf.Configuration): 2
FSDataInputStream (org.apache.hadoop.fs.FSDataInputStream): 2
FileSystem (org.apache.hadoop.fs.FileSystem): 2
Path (org.apache.hadoop.fs.Path): 2
BlockMissingException (org.apache.hadoop.hdfs.BlockMissingException): 2
Preconditions.checkArgument (com.google.common.base.Preconditions.checkArgument): 1
Strings.nullToEmpty (com.google.common.base.Strings.nullToEmpty): 1