Examples with ICEBERG_DOMAIN_COMPACTION_THRESHOLD - io.trino.plugin.iceberg.IcebergSplitManager.ICEBERG_DOMAIN_COMPACTION

Example 1 with ICEBERG_DOMAIN_COMPACTION_THRESHOLD

use of io.trino.plugin.iceberg.IcebergSplitManager.ICEBERG_DOMAIN_COMPACTION_THRESHOLD in project trino by trinodb.

the class BaseIcebergConnectorTest method testPredicatePushdown.

@Test
public void testPredicatePushdown() {
    QualifiedObjectName tableName = new QualifiedObjectName("iceberg", "tpch", "test_predicate");
    assertUpdate(format("CREATE TABLE %s (col1 BIGINT, col2 BIGINT, col3 BIGINT) WITH (partitioning = ARRAY['col2', 'col3'])", tableName));
    assertUpdate(format("INSERT INTO %s VALUES (1, 10, 100)", tableName), 1L);
    assertUpdate(format("INSERT INTO %s VALUES (2, 20, 200)", tableName), 1L);
    assertQuery(format("SELECT * FROM %s WHERE col1 = 1", tableName), "VALUES (1, 10, 100)");
    assertFilterPushdown(tableName, ImmutableMap.of("col1", singleValue(BIGINT, 1L)), ImmutableMap.of(), ImmutableMap.of("col1", singleValue(BIGINT, 1L)));
    assertQuery(format("SELECT * FROM %s WHERE col2 = 10", tableName), "VALUES (1, 10, 100)");
    assertFilterPushdown(tableName, ImmutableMap.of("col2", singleValue(BIGINT, 10L)), ImmutableMap.of("col2", singleValue(BIGINT, 10L)), ImmutableMap.of());
    assertQuery(format("SELECT * FROM %s WHERE col1 = 1 AND col2 = 10", tableName), "VALUES (1, 10, 100)");
    assertFilterPushdown(tableName, ImmutableMap.of("col1", singleValue(BIGINT, 1L), "col2", singleValue(BIGINT, 10L)), ImmutableMap.of("col2", singleValue(BIGINT, 10L)), ImmutableMap.of("col1", singleValue(BIGINT, 1L)));
    // Assert pushdown for an IN predicate with value count above the default compaction threshold
    List<Long> values = LongStream.range(1L, 1010L).boxed().filter(index -> index != 20L).collect(toImmutableList());
    assertTrue(values.size() > ICEBERG_DOMAIN_COMPACTION_THRESHOLD);
    String valuesString = join(",", values.stream().map(Object::toString).collect(toImmutableList()));
    String inPredicate = "%s IN (" + valuesString + ")";
    assertQuery(format("SELECT * FROM %s WHERE %s AND %s", tableName, format(inPredicate, "col1"), format(inPredicate, "col2")), "VALUES (1, 10, 100)");
    assertFilterPushdown(tableName, ImmutableMap.of("col1", multipleValues(BIGINT, values), "col2", multipleValues(BIGINT, values)), ImmutableMap.of("col2", multipleValues(BIGINT, values)), // Unenforced predicate is simplified during split generation, but not reflected here
    ImmutableMap.of("col1", multipleValues(BIGINT, values)));
    dropTable(tableName.getObjectName());
}

Also used : SkipException(org.testng.SkipException) FileSystem(org.apache.hadoop.fs.FileSystem) Test(org.testng.annotations.Test) TestTable(io.trino.testing.sql.TestTable) BROADCAST(io.trino.sql.planner.OptimizerConfig.JoinDistributionType.BROADCAST) TableNotFoundException(io.trino.spi.connector.TableNotFoundException) Matcher(java.util.regex.Matcher) Map(java.util.Map) MaterializedRow(io.trino.testing.MaterializedRow) Path(java.nio.file.Path) Assert.assertFalse(org.testng.Assert.assertFalse) Assert.assertEquals(io.trino.testing.assertions.Assert.assertEquals) GenericDatumWriter(org.apache.avro.generic.GenericDatumWriter) Assert.assertNotEquals(org.testng.Assert.assertNotEquals) HdfsEnvironment(io.trino.plugin.hive.HdfsEnvironment) Schema(org.apache.avro.Schema) DataProviders(io.trino.testing.DataProviders) Domain(io.trino.spi.predicate.Domain) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) Set(java.util.Set) DataFileWriter(org.apache.avro.file.DataFileWriter) HDFS_ENVIRONMENT(io.trino.plugin.hive.HiveTestUtils.HDFS_ENVIRONMENT) Collectors.joining(java.util.stream.Collectors.joining) ImmutableMap.toImmutableMap(com.google.common.collect.ImmutableMap.toImmutableMap) TestingSession.testSessionBuilder(io.trino.testing.TestingSession.testSessionBuilder) Stream(java.util.stream.Stream) ORC(io.trino.plugin.iceberg.IcebergFileFormat.ORC) Session(io.trino.Session) NullableValue(io.trino.spi.predicate.NullableValue) GenericData(org.apache.avro.generic.GenericData) VARCHAR(io.trino.spi.type.VarcharType.VARCHAR) String.join(java.lang.String.join) ColumnHandle(io.trino.spi.connector.ColumnHandle) ConstraintApplicationResult(io.trino.spi.connector.ConstraintApplicationResult) LongStream(java.util.stream.LongStream) OperatorStats(io.trino.operator.OperatorStats) Language(org.intellij.lang.annotations.Language) Files(java.nio.file.Files) IOException(java.io.IOException) Iterables.getOnlyElement(com.google.common.collect.Iterables.getOnlyElement) File(java.io.File) BaseConnectorTest(io.trino.testing.BaseConnectorTest) DOUBLE(io.trino.spi.type.DoubleType.DOUBLE) TableHandle(io.trino.metadata.TableHandle) QualifiedObjectName(io.trino.metadata.QualifiedObjectName) Paths(java.nio.file.Paths) QueryRunner(io.trino.testing.QueryRunner) Domain.multipleValues(io.trino.spi.predicate.Domain.multipleValues) QueryId(io.trino.spi.QueryId) TransactionBuilder.transaction(io.trino.transaction.TransactionBuilder.transaction) IntStream.range(java.util.stream.IntStream.range) MaterializedResult(io.trino.testing.MaterializedResult) Assertions.assertThat(org.assertj.core.api.Assertions.assertThat) PREFERRED_WRITE_PARTITIONING_MIN_NUMBER_OF_PARTITIONS(io.trino.SystemSessionProperties.PREFERRED_WRITE_PARTITIONING_MIN_NUMBER_OF_PARTITIONS) MoreCollectors.onlyElement(com.google.common.collect.MoreCollectors.onlyElement) ICEBERG_DOMAIN_COMPACTION_THRESHOLD(io.trino.plugin.iceberg.IcebergSplitManager.ICEBERG_DOMAIN_COMPACTION_THRESHOLD) Preconditions.checkArgument(com.google.common.base.Preconditions.checkArgument) QueryAssertions.assertEqualsIgnoreOrder(io.trino.testing.QueryAssertions.assertEqualsIgnoreOrder) Locale(java.util.Locale) Iterables.concat(com.google.common.collect.Iterables.concat) TestingConnectorBehavior(io.trino.testing.TestingConnectorBehavior) TestTable.randomTableSuffix(io.trino.testing.sql.TestTable.randomTableSuffix) TpchTable(io.trino.tpch.TpchTable) ImmutableSet(com.google.common.collect.ImmutableSet) ImmutableMap(com.google.common.collect.ImmutableMap) Predicate(java.util.function.Predicate) Collections.nCopies(java.util.Collections.nCopies) Collectors(java.util.stream.Collectors) String.format(java.lang.String.format) ICEBERG_CATALOG(io.trino.plugin.iceberg.IcebergQueryRunner.ICEBERG_CATALOG) DataSize(io.airlift.units.DataSize) List(java.util.List) HdfsContext(io.trino.plugin.hive.HdfsEnvironment.HdfsContext) BIGINT(io.trino.spi.type.BigintType.BIGINT) Domain.singleValue(io.trino.spi.predicate.Domain.singleValue) Optional(java.util.Optional) Pattern(java.util.regex.Pattern) DataFileReader(org.apache.avro.file.DataFileReader) GenericDatumReader(org.apache.avro.generic.GenericDatumReader) IcebergQueryRunner.createIcebergQueryRunner(io.trino.plugin.iceberg.IcebergQueryRunner.createIcebergQueryRunner) LINE_ITEM(io.trino.tpch.TpchTable.LINE_ITEM) IntStream(java.util.stream.IntStream) Constraint(io.trino.spi.connector.Constraint) DataProvider(org.testng.annotations.DataProvider) PARQUET(io.trino.plugin.iceberg.IcebergFileFormat.PARQUET) ImmutableList(com.google.common.collect.ImmutableList) Assertions.assertThatThrownBy(org.assertj.core.api.Assertions.assertThatThrownBy) Objects.requireNonNull(java.util.Objects.requireNonNull) TableStatistics(io.trino.spi.statistics.TableStatistics) NoSuchElementException(java.util.NoSuchElementException) VerifyException(com.google.common.base.VerifyException) OutputStream(java.io.OutputStream) ResultWithQueryId(io.trino.testing.ResultWithQueryId) TupleDomain(io.trino.spi.predicate.TupleDomain) Consumer(java.util.function.Consumer) Assert.assertEventually(io.trino.testing.assertions.Assert.assertEventually) JOIN_DISTRIBUTION_TYPE(io.trino.SystemSessionProperties.JOIN_DISTRIBUTION_TYPE) Metadata(io.trino.metadata.Metadata) Assert.assertTrue(org.testng.Assert.assertTrue) MaterializedResult.resultBuilder(io.trino.testing.MaterializedResult.resultBuilder) QualifiedObjectName(io.trino.metadata.QualifiedObjectName) Test(org.testng.annotations.Test) BaseConnectorTest(io.trino.testing.BaseConnectorTest)

Example 2 with ICEBERG_DOMAIN_COMPACTION_THRESHOLD

use of io.trino.plugin.iceberg.IcebergSplitManager.ICEBERG_DOMAIN_COMPACTION_THRESHOLD in project trino by trinodb.

the class IcebergPageSourceProvider method createPageSource.

@Override
public ConnectorPageSource createPageSource(ConnectorTransactionHandle transaction, ConnectorSession session, ConnectorSplit connectorSplit, ConnectorTableHandle connectorTable, List<ColumnHandle> columns, DynamicFilter dynamicFilter) {
    IcebergSplit split = (IcebergSplit) connectorSplit;
    IcebergTableHandle table = (IcebergTableHandle) connectorTable;
    List<IcebergColumnHandle> icebergColumns = columns.stream().map(IcebergColumnHandle.class::cast).collect(toImmutableList());
    Map<Integer, Optional<String>> partitionKeys = split.getPartitionKeys();
    List<IcebergColumnHandle> regularColumns = columns.stream().map(IcebergColumnHandle.class::cast).filter(column -> !partitionKeys.containsKey(column.getId())).collect(toImmutableList());
    TupleDomain<IcebergColumnHandle> effectivePredicate = table.getUnenforcedPredicate().intersect(dynamicFilter.getCurrentPredicate().transformKeys(IcebergColumnHandle.class::cast)).simplify(ICEBERG_DOMAIN_COMPACTION_THRESHOLD);
    HdfsContext hdfsContext = new HdfsContext(session);
    ReaderPageSource dataPageSource = createDataPageSource(session, hdfsContext, new Path(split.getPath()), split.getStart(), split.getLength(), split.getFileSize(), split.getFileFormat(), regularColumns, effectivePredicate, table.getNameMappingJson().map(NameMappingParser::fromJson));
    Optional<ReaderProjectionsAdapter> projectionsAdapter = dataPageSource.getReaderColumns().map(readerColumns -> new ReaderProjectionsAdapter(regularColumns, readerColumns, column -> ((IcebergColumnHandle) column).getType(), IcebergPageSourceProvider::applyProjection));
    return new IcebergPageSource(icebergColumns, partitionKeys, dataPageSource.get(), projectionsAdapter);
}

Also used : FileSystem(org.apache.hadoop.fs.FileSystem) Maps.uniqueIndex(com.google.common.collect.Maps.uniqueIndex) ORC_ICEBERG_ID_KEY(io.trino.plugin.iceberg.TypeConverter.ORC_ICEBERG_ID_KEY) ColumnAdaptation(io.trino.plugin.hive.orc.OrcPageSource.ColumnAdaptation) FileStatus(org.apache.hadoop.fs.FileStatus) BlockMissingException(org.apache.hadoop.hdfs.BlockMissingException) ParquetDataSourceId(io.trino.parquet.ParquetDataSourceId) NOT_SUPPORTED(io.trino.spi.StandardErrorCode.NOT_SUPPORTED) ConnectorTableHandle(io.trino.spi.connector.ConnectorTableHandle) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) ConnectorPageSource(io.trino.spi.connector.ConnectorPageSource) ICEBERG_CANNOT_OPEN_SPLIT(io.trino.plugin.iceberg.IcebergErrorCode.ICEBERG_CANNOT_OPEN_SPLIT) UUID(io.trino.spi.type.UuidType.UUID) ENGLISH(java.util.Locale.ENGLISH) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream) FileFormatDataSourceStats(io.trino.plugin.hive.FileFormatDataSourceStats) HdfsEnvironment(io.trino.plugin.hive.HdfsEnvironment) ConnectorIdentity(io.trino.spi.security.ConnectorIdentity) ICEBERG_FILESYSTEM_ERROR(io.trino.plugin.iceberg.IcebergErrorCode.ICEBERG_FILESYSTEM_ERROR) Domain(io.trino.spi.predicate.Domain) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) ReaderColumns(io.trino.plugin.hive.ReaderColumns) Set(java.util.Set) ReaderPageSource(io.trino.plugin.hive.ReaderPageSource) ImmutableMap.toImmutableMap(com.google.common.collect.ImmutableMap.toImmutableMap) OrcReaderConfig(io.trino.plugin.hive.orc.OrcReaderConfig) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) BlockMetaData(org.apache.parquet.hadoop.metadata.BlockMetaData) ColumnIO(org.apache.parquet.io.ColumnIO) IcebergSessionProperties.getOrcTinyStripeThreshold(io.trino.plugin.iceberg.IcebergSessionProperties.getOrcTinyStripeThreshold) ParquetReaderConfig(io.trino.plugin.hive.parquet.ParquetReaderConfig) ParquetCorruptionException(io.trino.parquet.ParquetCorruptionException) MappedField(org.apache.iceberg.mapping.MappedField) Collectors.groupingBy(java.util.stream.Collectors.groupingBy) IcebergSessionProperties.isOrcNestedLazy(io.trino.plugin.iceberg.IcebergSessionProperties.isOrcNestedLazy) IcebergSessionProperties.getOrcMaxBufferSize(io.trino.plugin.iceberg.IcebergSessionProperties.getOrcMaxBufferSize) HdfsParquetDataSource(io.trino.plugin.hive.parquet.HdfsParquetDataSource) ArrayList(java.util.ArrayList) Collectors.toUnmodifiableList(java.util.stream.Collectors.toUnmodifiableList) TupleDomainOrcPredicateBuilder(io.trino.orc.TupleDomainOrcPredicate.TupleDomainOrcPredicateBuilder) OrcPageSource(io.trino.plugin.hive.orc.OrcPageSource) IcebergSessionProperties.getOrcMaxMergeDistance(io.trino.plugin.iceberg.IcebergSessionProperties.getOrcMaxMergeDistance) ICEBERG_MISSING_DATA(io.trino.plugin.iceberg.IcebergErrorCode.ICEBERG_MISSING_DATA) AggregatedMemoryContext(io.trino.memory.context.AggregatedMemoryContext) ColumnHandle(io.trino.spi.connector.ColumnHandle) ImmutableSet.toImmutableSet(com.google.common.collect.ImmutableSet.toImmutableSet) VARBINARY(io.trino.spi.type.VarbinaryType.VARBINARY) MappedFields(org.apache.iceberg.mapping.MappedFields) RichColumnDescriptor(io.trino.parquet.RichColumnDescriptor) OrcType(io.trino.orc.metadata.OrcType) Predicate(io.trino.parquet.predicate.Predicate) IcebergSessionProperties.isUseFileSizeFromMetadata(io.trino.plugin.iceberg.IcebergSessionProperties.isUseFileSizeFromMetadata) MapType(io.trino.spi.type.MapType) PredicateUtils.predicateMatches(io.trino.parquet.predicate.PredicateUtils.predicateMatches) ConnectorSplit(io.trino.spi.connector.ConnectorSplit) StandardTypes(io.trino.spi.type.StandardTypes) NameMappingParser(org.apache.iceberg.mapping.NameMappingParser) IcebergSessionProperties.getOrcLazyReadSmallRanges(io.trino.plugin.iceberg.IcebergSessionProperties.getOrcLazyReadSmallRanges) IcebergSessionProperties.getParquetMaxReadBlockSize(io.trino.plugin.iceberg.IcebergSessionProperties.getParquetMaxReadBlockSize) IOException(java.io.IOException) ConnectorSession(io.trino.spi.connector.ConnectorSession) UTC(org.joda.time.DateTimeZone.UTC) Field(io.trino.parquet.Field) Traverser(com.google.common.graph.Traverser) ParquetPageSource(io.trino.plugin.hive.parquet.ParquetPageSource) ProjectedLayout(io.trino.orc.OrcReader.ProjectedLayout) FileMetaData(org.apache.parquet.hadoop.metadata.FileMetaData) IcebergSessionProperties.getOrcStreamBufferSize(io.trino.plugin.iceberg.IcebergSessionProperties.getOrcStreamBufferSize) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) ParquetSchemaUtil(org.apache.iceberg.parquet.ParquetSchemaUtil) OrcColumn(io.trino.orc.OrcColumn) PredicateUtils.buildPredicate(io.trino.parquet.predicate.PredicateUtils.buildPredicate) MetadataReader(io.trino.parquet.reader.MetadataReader) ICEBERG_DOMAIN_COMPACTION_THRESHOLD(io.trino.plugin.iceberg.IcebergSplitManager.ICEBERG_DOMAIN_COMPACTION_THRESHOLD) OrcRecordReader(io.trino.orc.OrcRecordReader) NameMapping(org.apache.iceberg.mapping.NameMapping) Path(org.apache.hadoop.fs.Path) OrcDataSource(io.trino.orc.OrcDataSource) ReaderProjectionsAdapter(io.trino.plugin.hive.ReaderProjectionsAdapter) RowType(io.trino.spi.type.RowType) ICEBERG_BAD_DATA(io.trino.plugin.iceberg.IcebergErrorCode.ICEBERG_BAD_DATA) ImmutableMap(com.google.common.collect.ImmutableMap) INITIAL_BATCH_SIZE(io.trino.orc.OrcReader.INITIAL_BATCH_SIZE) ParquetReader(io.trino.parquet.reader.ParquetReader) FieldContext(io.trino.plugin.iceberg.IcebergParquetColumnIOConverter.FieldContext) ICEBERG_CURSOR_ERROR(io.trino.plugin.iceberg.IcebergErrorCode.ICEBERG_CURSOR_ERROR) TrinoException(io.trino.spi.TrinoException) ArrayType(io.trino.spi.type.ArrayType) ParquetTypeUtils.getColumnIO(io.trino.parquet.ParquetTypeUtils.getColumnIO) Collectors(java.util.stream.Collectors) String.format(java.lang.String.format) Preconditions.checkState(com.google.common.base.Preconditions.checkState) Objects(java.util.Objects) OrcDataSourceId(io.trino.orc.OrcDataSourceId) MessageType(org.apache.parquet.schema.MessageType) HdfsContext(io.trino.plugin.hive.HdfsEnvironment.HdfsContext) List(java.util.List) DynamicFilter(io.trino.spi.connector.DynamicFilter) Optional(java.util.Optional) IcebergSessionProperties.getOrcMaxReadBlockSize(io.trino.plugin.iceberg.IcebergSessionProperties.getOrcMaxReadBlockSize) MessageColumnIO(org.apache.parquet.io.MessageColumnIO) AggregatedMemoryContext.newSimpleAggregatedMemoryContext(io.trino.memory.context.AggregatedMemoryContext.newSimpleAggregatedMemoryContext) Type(io.trino.spi.type.Type) HashMap(java.util.HashMap) TupleDomainOrcPredicate(io.trino.orc.TupleDomainOrcPredicate) Function(java.util.function.Function) Inject(javax.inject.Inject) ImmutableList(com.google.common.collect.ImmutableList) Verify.verify(com.google.common.base.Verify.verify) OrcReaderOptions(io.trino.orc.OrcReaderOptions) Objects.requireNonNull(java.util.Objects.requireNonNull) Collectors.mapping(java.util.stream.Collectors.mapping) IcebergSessionProperties.isOrcBloomFiltersEnabled(io.trino.plugin.iceberg.IcebergSessionProperties.isOrcBloomFiltersEnabled) HdfsOrcDataSource(io.trino.plugin.hive.orc.HdfsOrcDataSource) ParquetReaderOptions(io.trino.parquet.ParquetReaderOptions) OrcReader(io.trino.orc.OrcReader) ConnectorPageSourceProvider(io.trino.spi.connector.ConnectorPageSourceProvider) ICEBERG_BINARY_TYPE(io.trino.plugin.iceberg.TypeConverter.ICEBERG_BINARY_TYPE) TupleDomain(io.trino.spi.predicate.TupleDomain) OrcReader.fullyProjectedLayout(io.trino.orc.OrcReader.fullyProjectedLayout) OrcCorruptionException(io.trino.orc.OrcCorruptionException) Collectors.toList(java.util.stream.Collectors.toList) ParquetTypeUtils.getDescriptors(io.trino.parquet.ParquetTypeUtils.getDescriptors) ParquetDataSource(io.trino.parquet.ParquetDataSource) TypeManager(io.trino.spi.type.TypeManager) ConnectorTransactionHandle(io.trino.spi.connector.ConnectorTransactionHandle) Path(org.apache.hadoop.fs.Path) ReaderProjectionsAdapter(io.trino.plugin.hive.ReaderProjectionsAdapter) Optional(java.util.Optional) ReaderPageSource(io.trino.plugin.hive.ReaderPageSource) HdfsContext(io.trino.plugin.hive.HdfsEnvironment.HdfsContext)

Example 3 with ICEBERG_DOMAIN_COMPACTION_THRESHOLD

use of io.trino.plugin.iceberg.IcebergSplitManager.ICEBERG_DOMAIN_COMPACTION_THRESHOLD in project trino by trinodb.

the class IcebergSplitSource method getNextBatch.

@Override
public CompletableFuture<ConnectorSplitBatch> getNextBatch(ConnectorPartitionHandle partitionHandle, int maxSize) {
    long timeLeft = dynamicFilteringWaitTimeoutMillis - dynamicFilterWaitStopwatch.elapsed(MILLISECONDS);
    if (dynamicFilter.isAwaitable() && timeLeft > 0) {
        return dynamicFilter.isBlocked().thenApply(ignored -> EMPTY_BATCH).completeOnTimeout(EMPTY_BATCH, timeLeft, MILLISECONDS);
    }
    if (combinedScanIterable == null) {
        // Used to avoid duplicating work if the Dynamic Filter was already pushed down to the Iceberg API
        this.pushedDownDynamicFilterPredicate = dynamicFilter.getCurrentPredicate().transformKeys(IcebergColumnHandle.class::cast);
        TupleDomain<IcebergColumnHandle> fullPredicate = tableHandle.getUnenforcedPredicate().intersect(pushedDownDynamicFilterPredicate);
        // TODO: (https://github.com/trinodb/trino/issues/9743): Consider removing TupleDomain#simplify
        TupleDomain<IcebergColumnHandle> simplifiedPredicate = fullPredicate.simplify(ICEBERG_DOMAIN_COMPACTION_THRESHOLD);
        if (!simplifiedPredicate.equals(fullPredicate)) {
            // Pushed down predicate was simplified, always evaluate it against individual splits
            this.pushedDownDynamicFilterPredicate = TupleDomain.all();
        }
        TupleDomain<IcebergColumnHandle> effectivePredicate = tableHandle.getEnforcedPredicate().intersect(simplifiedPredicate);
        if (effectivePredicate.isNone()) {
            finish();
            return completedFuture(NO_MORE_SPLITS_BATCH);
        }
        Expression filterExpression = toIcebergExpression(effectivePredicate);
        this.combinedScanIterable = tableScan.filter(filterExpression).includeColumnStats().planTasks();
        this.fileScanIterator = Streams.stream(combinedScanIterable).map(CombinedScanTask::files).flatMap(Collection::stream).iterator();
    }
    TupleDomain<IcebergColumnHandle> dynamicFilterPredicate = dynamicFilter.getCurrentPredicate().transformKeys(IcebergColumnHandle.class::cast);
    if (dynamicFilterPredicate.isNone()) {
        finish();
        return completedFuture(NO_MORE_SPLITS_BATCH);
    }
    Iterator<FileScanTask> fileScanTasks = Iterators.limit(fileScanIterator, maxSize);
    ImmutableList.Builder<ConnectorSplit> splits = ImmutableList.builder();
    while (fileScanTasks.hasNext()) {
        FileScanTask scanTask = fileScanTasks.next();
        if (!scanTask.deletes().isEmpty()) {
            throw new TrinoException(NOT_SUPPORTED, "Iceberg tables with delete files are not supported: " + tableHandle.getSchemaTableName());
        }
        if (maxScannedFileSizeInBytes.isPresent() && scanTask.file().fileSizeInBytes() > maxScannedFileSizeInBytes.get()) {
            continue;
        }
        IcebergSplit icebergSplit = toIcebergSplit(scanTask);
        Schema fileSchema = scanTask.spec().schema();
        Set<IcebergColumnHandle> identityPartitionColumns = icebergSplit.getPartitionKeys().keySet().stream().map(fieldId -> getColumnHandle(fileSchema.findField(fieldId), typeManager)).collect(toImmutableSet());
        Supplier<Map<ColumnHandle, NullableValue>> partitionValues = memoize(() -> {
            Map<ColumnHandle, NullableValue> bindings = new HashMap<>();
            for (IcebergColumnHandle partitionColumn : identityPartitionColumns) {
                Object partitionValue = deserializePartitionValue(partitionColumn.getType(), icebergSplit.getPartitionKeys().get(partitionColumn.getId()).orElse(null), partitionColumn.getName());
                NullableValue bindingValue = new NullableValue(partitionColumn.getType(), partitionValue);
                bindings.put(partitionColumn, bindingValue);
            }
            return bindings;
        });
        if (!dynamicFilterPredicate.isAll() && !dynamicFilterPredicate.equals(pushedDownDynamicFilterPredicate)) {
            if (!partitionMatchesPredicate(identityPartitionColumns, partitionValues, dynamicFilterPredicate)) {
                continue;
            }
            if (!fileMatchesPredicate(fieldIdToType, dynamicFilterPredicate, scanTask.file().lowerBounds(), scanTask.file().upperBounds(), scanTask.file().nullValueCounts())) {
                continue;
            }
        }
        if (!partitionMatchesConstraint(identityPartitionColumns, partitionValues, constraint)) {
            continue;
        }
        if (recordScannedFiles) {
            scannedFiles.add(scanTask.file());
        }
        splits.add(icebergSplit);
    }
    return completedFuture(new ConnectorSplitBatch(splits.build(), isFinished()));
}

Also used : IcebergUtil.getPartitionKeys(io.trino.plugin.iceberg.IcebergUtil.getPartitionKeys) CompletableFuture.completedFuture(java.util.concurrent.CompletableFuture.completedFuture) ByteBuffer(java.nio.ByteBuffer) TypeConverter.toIcebergType(io.trino.plugin.iceberg.TypeConverter.toIcebergType) Duration(io.airlift.units.Duration) ICEBERG_DOMAIN_COMPACTION_THRESHOLD(io.trino.plugin.iceberg.IcebergSplitManager.ICEBERG_DOMAIN_COMPACTION_THRESHOLD) NOT_SUPPORTED(io.trino.spi.StandardErrorCode.NOT_SUPPORTED) Expression(org.apache.iceberg.expressions.Expression) ConnectorPartitionHandle(io.trino.spi.connector.ConnectorPartitionHandle) Map(java.util.Map) FileScanTask(org.apache.iceberg.FileScanTask) DataFile(org.apache.iceberg.DataFile) IcebergUtil.getColumnHandle(io.trino.plugin.iceberg.IcebergUtil.getColumnHandle) ImmutableSet(com.google.common.collect.ImmutableSet) CloseableIterable(org.apache.iceberg.io.CloseableIterable) Range(io.trino.spi.predicate.Range) Domain(io.trino.spi.predicate.Domain) Collection(java.util.Collection) Set(java.util.Set) TrinoException(io.trino.spi.TrinoException) TableScan(org.apache.iceberg.TableScan) MILLISECONDS(java.util.concurrent.TimeUnit.MILLISECONDS) Streams(com.google.common.collect.Streams) Schema(org.apache.iceberg.Schema) CombinedScanTask(org.apache.iceberg.CombinedScanTask) ValueSet(io.trino.spi.predicate.ValueSet) Preconditions.checkState(com.google.common.base.Preconditions.checkState) Type(org.apache.iceberg.types.Type) UncheckedIOException(java.io.UncheckedIOException) DataSize(io.airlift.units.DataSize) List(java.util.List) DynamicFilter(io.trino.spi.connector.DynamicFilter) Optional(java.util.Optional) Constraint(io.trino.spi.connector.Constraint) IcebergUtil.deserializePartitionValue(io.trino.plugin.iceberg.IcebergUtil.deserializePartitionValue) NullableValue(io.trino.spi.predicate.NullableValue) Stopwatch(com.google.common.base.Stopwatch) HashMap(java.util.HashMap) CompletableFuture(java.util.concurrent.CompletableFuture) Supplier(java.util.function.Supplier) ExpressionConverter.toIcebergExpression(io.trino.plugin.iceberg.ExpressionConverter.toIcebergExpression) Iterators(com.google.common.collect.Iterators) IcebergTypes.convertIcebergValueToTrino(io.trino.plugin.iceberg.IcebergTypes.convertIcebergValueToTrino) ImmutableList(com.google.common.collect.ImmutableList) Verify.verify(com.google.common.base.Verify.verify) Objects.requireNonNull(java.util.Objects.requireNonNull) ColumnHandle(io.trino.spi.connector.ColumnHandle) ImmutableSet.toImmutableSet(com.google.common.collect.ImmutableSet.toImmutableSet) Suppliers.memoize(com.google.common.base.Suppliers.memoize) Nullable(javax.annotation.Nullable) Iterator(java.util.Iterator) ConnectorSplit(io.trino.spi.connector.ConnectorSplit) ConnectorSplitSource(io.trino.spi.connector.ConnectorSplitSource) IOException(java.io.IOException) TupleDomain(io.trino.spi.predicate.TupleDomain) Conversions.fromByteBuffer(org.apache.iceberg.types.Conversions.fromByteBuffer) Sets.intersection(com.google.common.collect.Sets.intersection) IcebergUtil.primitiveFieldTypes(io.trino.plugin.iceberg.IcebergUtil.primitiveFieldTypes) VisibleForTesting(com.google.common.annotations.VisibleForTesting) TypeManager(io.trino.spi.type.TypeManager) Collections(java.util.Collections) IcebergUtil.getColumnHandle(io.trino.plugin.iceberg.IcebergUtil.getColumnHandle) ColumnHandle(io.trino.spi.connector.ColumnHandle) HashMap(java.util.HashMap) ImmutableList(com.google.common.collect.ImmutableList) Schema(org.apache.iceberg.Schema) NullableValue(io.trino.spi.predicate.NullableValue) Expression(org.apache.iceberg.expressions.Expression) ExpressionConverter.toIcebergExpression(io.trino.plugin.iceberg.ExpressionConverter.toIcebergExpression) Collection(java.util.Collection) TrinoException(io.trino.spi.TrinoException) FileScanTask(org.apache.iceberg.FileScanTask) ConnectorSplit(io.trino.spi.connector.ConnectorSplit) Map(java.util.Map) HashMap(java.util.HashMap)

Aggregations

ImmutableList (com.google.common.collect.ImmutableList)3 ICEBERG_DOMAIN_COMPACTION_THRESHOLD (io.trino.plugin.iceberg.IcebergSplitManager.ICEBERG_DOMAIN_COMPACTION_THRESHOLD)3 ColumnHandle (io.trino.spi.connector.ColumnHandle)3 Domain (io.trino.spi.predicate.Domain)3 TupleDomain (io.trino.spi.predicate.TupleDomain)3 IOException (java.io.IOException)3 Preconditions.checkState (com.google.common.base.Preconditions.checkState)2 Verify.verify (com.google.common.base.Verify.verify)2 ImmutableList.toImmutableList (com.google.common.collect.ImmutableList.toImmutableList)2 ImmutableMap (com.google.common.collect.ImmutableMap)2 ImmutableMap.toImmutableMap (com.google.common.collect.ImmutableMap.toImmutableMap)2 ImmutableSet (com.google.common.collect.ImmutableSet)2 ImmutableSet.toImmutableSet (com.google.common.collect.ImmutableSet.toImmutableSet)2 DataSize (io.airlift.units.DataSize)2 HdfsEnvironment (io.trino.plugin.hive.HdfsEnvironment)2 HdfsContext (io.trino.plugin.hive.HdfsEnvironment.HdfsContext)2 String.format (java.lang.String.format)2 List (java.util.List)2 Map (java.util.Map)2 Objects.requireNonNull (java.util.Objects.requireNonNull)2