Search in sources:

Example 1 with REGULAR

Use of io.trino.plugin.deltalake.DeltaLakeColumnType.REGULAR in project trino by trinodb.

From the class TestDeltaLakeSchemaSupport, method testRoundTripComplexSchema:

@Test
public void testRoundTripComplexSchema() throws IOException, URISyntaxException {
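    // Parse the reference schema, wrap every column in a REGULAR DeltaLakeColumnHandle,
    // serialize the handles back to JSON, and verify the round trip is lossless.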
    URL expected = getResource("io/trino/plugin/deltalake/transactionlog/schema/complex_schema.json");
    String json = Files.readString(Path.of(expected.toURI()));
    List<ColumnMetadata> schema = DeltaLakeSchemaSupport.getColumnMetadata(json, typeManager);
    List<DeltaLakeColumnHandle> columnHandles = schema.stream()
            .map(metadata -> new DeltaLakeColumnHandle(metadata.getName(), metadata.getType(), REGULAR))
            .collect(toImmutableList());
    ObjectMapper objectMapper = new ObjectMapper();
    assertEquals(objectMapper.readTree(serializeSchemaAsJson(columnHandles)), objectMapper.readTree(json));
}

Example 2 with REGULAR

Use of io.trino.plugin.deltalake.DeltaLakeColumnType.REGULAR in project trino by trinodb.

From the class TestTransactionLogAccess, method testSnapshotsAreConsistent:

@Test
public void testSnapshotsAreConsistent() throws Exception {
    String tableName = "person";
    File tempDir = Files.createTempDir();
    File tableDir = new File(tempDir, tableName);
    File transactionLogDir = new File(tableDir, TRANSACTION_LOG_DIRECTORY);
    transactionLogDir.mkdirs();
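    // Seed the table directory with the first 12 transaction log entries and the last-checkpoint marker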
    File resourceDir = new File(getClass().getClassLoader().getResource("databricks/person/_delta_log").toURI());
    copyTransactionLogEntry(0, 12, resourceDir, transactionLogDir);
    Files.copy(new File(resourceDir, LAST_CHECKPOINT_FILENAME), new File(transactionLogDir, LAST_CHECKPOINT_FILENAME));
    setupTransactionLogAccess(tableName, new Path(tableDir.toURI()));
    List<AddFileEntry> expectedDataFiles = transactionLogAccess.getActiveFiles(tableSnapshot, SESSION);
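    // Append two more commits; the snapshot captured above must not see the files they add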
    copyTransactionLogEntry(12, 14, resourceDir, transactionLogDir);
    Set<String> newDataFiles = ImmutableSet.of(
            "age=28/part-00000-40dd1707-1d42-4328-a59a-21f5c945fe60.c000.snappy.parquet",
            "age=29/part-00000-3794c463-cb0c-4beb-8d07-7cc1e3b5920f.c000.snappy.parquet");
    TableSnapshot updatedTableSnapshot = transactionLogAccess.loadSnapshot(new SchemaTableName("schema", tableName), new Path(tableDir.toURI()), SESSION);
    List<AddFileEntry> allDataFiles = transactionLogAccess.getActiveFiles(updatedTableSnapshot, SESSION);
    List<AddFileEntry> dataFilesWithFixedVersion = transactionLogAccess.getActiveFiles(tableSnapshot, SESSION);
    for (String newFilePath : newDataFiles) {
        assertTrue(allDataFiles.stream().anyMatch(entry -> entry.getPath().equals(newFilePath)));
        assertTrue(dataFilesWithFixedVersion.stream().noneMatch(entry -> entry.getPath().equals(newFilePath)));
    }
    assertEquals(expectedDataFiles.size(), dataFilesWithFixedVersion.size());
    List<ColumnMetadata> columns = extractSchema(transactionLogAccess.getMetadataEntry(tableSnapshot, SESSION).get(), TESTING_TYPE_MANAGER);
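    // Re-reading the snapshot at its original version must yield identical entries, including per-column statistics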
    for (int i = 0; i < expectedDataFiles.size(); i++) {
        AddFileEntry expected = expectedDataFiles.get(i);
        AddFileEntry actual = dataFilesWithFixedVersion.get(i);
        assertEquals(expected.getPath(), actual.getPath());
        assertEquals(expected.getPartitionValues(), actual.getPartitionValues());
        assertEquals(expected.getSize(), actual.getSize());
        assertEquals(expected.getModificationTime(), actual.getModificationTime());
        assertEquals(expected.isDataChange(), actual.isDataChange());
        assertEquals(expected.getTags(), actual.getTags());
        assertTrue(expected.getStats().isPresent());
        assertTrue(actual.getStats().isPresent());
        for (ColumnMetadata column : columns) {
            DeltaLakeColumnHandle columnHandle = new DeltaLakeColumnHandle(column.getName(), column.getType(), REGULAR);
            assertEquals(expected.getStats().get().getMinColumnValue(columnHandle), actual.getStats().get().getMinColumnValue(columnHandle));
            assertEquals(expected.getStats().get().getMaxColumnValue(columnHandle), actual.getStats().get().getMaxColumnValue(columnHandle));
            assertEquals(expected.getStats().get().getNullCount(columnHandle.getName()), actual.getStats().get().getNullCount(columnHandle.getName()));
            assertEquals(expected.getStats().get().getNumRecords(), actual.getStats().get().getNumRecords());
        }
    }
}

Example 3 with REGULAR

Use of io.trino.plugin.deltalake.DeltaLakeColumnType.REGULAR in project trino by trinodb.

From the class HiveMetastoreBackedDeltaLakeMetastore, method getTableStatistics:

@Override
public TableStatistics getTableStatistics(ConnectorSession session, DeltaLakeTableHandle tableHandle, Constraint constraint) {
    TableSnapshot tableSnapshot = getSnapshot(tableHandle.getSchemaTableName(), session);
    double numRecords = 0L;
    MetadataEntry metadata = transactionLogAccess.getMetadataEntry(tableSnapshot, session).orElseThrow(() -> new TrinoException(DELTA_LAKE_INVALID_SCHEMA, "Metadata not found in transaction log for " + tableHandle.getTableName()));
    List<ColumnMetadata> columnMetadata = DeltaLakeSchemaSupport.extractSchema(metadata, typeManager);
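    // Mark partition columns as PARTITION_KEY and every other column as REGULAR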
    List<DeltaLakeColumnHandle> columns = columnMetadata.stream()
            .map(columnMeta -> new DeltaLakeColumnHandle(
                    columnMeta.getName(),
                    columnMeta.getType(),
                    metadata.getCanonicalPartitionColumns().contains(columnMeta.getName()) ? PARTITION_KEY : REGULAR))
            .collect(toImmutableList());
    Map<DeltaLakeColumnHandle, Double> nullCounts = new HashMap<>();
    columns.forEach(column -> nullCounts.put(column, 0.0));
    Map<DeltaLakeColumnHandle, Double> minValues = new HashMap<>();
    Map<DeltaLakeColumnHandle, Double> maxValues = new HashMap<>();
    Map<DeltaLakeColumnHandle, Set<String>> partitioningColumnsDistinctValues = new HashMap<>();
    columns.stream()
            .filter(column -> column.getColumnType() == PARTITION_KEY)
            .forEach(column -> partitioningColumnsDistinctValues.put(column, new HashSet<>()));
    if (tableHandle.getEnforcedPartitionConstraint().isNone() || tableHandle.getNonPartitionConstraint().isNone() || constraint.getSummary().isNone()) {
        return createZeroStatistics(columns);
    }
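    // Only columns referenced by the pushed-down non-partition constraint need per-file statistics predicates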
    Set<String> predicatedColumnNames = tableHandle.getNonPartitionConstraint().getDomains().orElseThrow().keySet().stream()
            .map(DeltaLakeColumnHandle::getName)
            .collect(toImmutableSet());
    List<ColumnMetadata> predicatedColumns = columnMetadata.stream()
            .filter(column -> predicatedColumnNames.contains(column.getName()))
            .collect(toImmutableList());
    for (AddFileEntry addEntry : transactionLogAccess.getActiveFiles(tableSnapshot, session)) {
        Optional<? extends DeltaLakeFileStatistics> fileStatistics = addEntry.getStats();
        if (fileStatistics.isEmpty()) {
            // Open source Delta Lake does not collect stats
            return TableStatistics.empty();
        }
        DeltaLakeFileStatistics stats = fileStatistics.get();
        if (!partitionMatchesPredicate(addEntry.getCanonicalPartitionValues(), tableHandle.getEnforcedPartitionConstraint().getDomains().orElseThrow())) {
            continue;
        }
        TupleDomain<DeltaLakeColumnHandle> statisticsPredicate = createStatisticsPredicate(addEntry, predicatedColumns, tableHandle.getMetadataEntry().getCanonicalPartitionColumns());
        if (!tableHandle.getNonPartitionConstraint().overlaps(statisticsPredicate)) {
            continue;
        }
        if (stats.getNumRecords().isEmpty()) {
            // Not clear if it's possible for stats to be present with no row count, but bail out if that happens
            return TableStatistics.empty();
        }
        numRecords += stats.getNumRecords().get();
        for (DeltaLakeColumnHandle column : columns) {
            if (column.getColumnType() == PARTITION_KEY) {
                Optional<String> partitionValue = addEntry.getCanonicalPartitionValues().get(column.getName());
                if (partitionValue.isEmpty()) {
                    nullCounts.merge(column, (double) stats.getNumRecords().get(), Double::sum);
                } else {
                    // NULL is not counted as a distinct value
                    // Code below assumes that values returned by addEntry.getCanonicalPartitionValues() are normalized,
                    // it may not be true in case of real, doubles, timestamps etc
                    partitioningColumnsDistinctValues.get(column).add(partitionValue.get());
                }
            } else {
                Optional<Long> maybeNullCount = stats.getNullCount(column.getName());
                if (maybeNullCount.isPresent()) {
                    nullCounts.put(column, nullCounts.get(column) + maybeNullCount.get());
                } else {
                    // If any individual file fails to report null counts, fail to calculate the total for the table
                    nullCounts.put(column, NaN);
                }
            }
            // Math.min returns NaN if any operand is NaN
            stats.getMinColumnValue(column)
                    .map(parsedValue -> toStatsRepresentation(column.getType(), parsedValue))
                    .filter(OptionalDouble::isPresent)
                    .map(OptionalDouble::getAsDouble)
                    .ifPresent(parsedValueAsDouble -> minValues.merge(column, parsedValueAsDouble, Math::min));
            stats.getMaxColumnValue(column)
                    .map(parsedValue -> toStatsRepresentation(column.getType(), parsedValue))
                    .filter(OptionalDouble::isPresent)
                    .map(OptionalDouble::getAsDouble)
                    .ifPresent(parsedValueAsDouble -> maxValues.merge(column, parsedValueAsDouble, Math::max));
        }
    }
    if (numRecords == 0) {
        return createZeroStatistics(columns);
    }
    TableStatistics.Builder statsBuilder = new TableStatistics.Builder().setRowCount(Estimate.of(numRecords));
    Optional<DeltaLakeStatistics> statistics = Optional.empty();
    if (isExtendedStatisticsEnabled(session)) {
        statistics = statisticsAccess.readDeltaLakeStatistics(session, tableHandle.getLocation());
    }
    for (DeltaLakeColumnHandle column : columns) {
        ColumnStatistics.Builder columnStatsBuilder = new ColumnStatistics.Builder();
        Double nullCount = nullCounts.get(column);
        columnStatsBuilder.setNullsFraction(nullCount.isNaN() ? Estimate.unknown() : Estimate.of(nullCount / numRecords));
        Double maxValue = maxValues.get(column);
        Double minValue = minValues.get(column);
        if (isValidInRange(maxValue) && isValidInRange(minValue)) {
            columnStatsBuilder.setRange(new DoubleRange(minValue, maxValue));
        } else if (isValidInRange(maxValue)) {
            columnStatsBuilder.setRange(new DoubleRange(NEGATIVE_INFINITY, maxValue));
        } else if (isValidInRange(minValue)) {
            columnStatsBuilder.setRange(new DoubleRange(minValue, POSITIVE_INFINITY));
        }
        // extend statistics with NDV
        if (column.getColumnType() == PARTITION_KEY) {
            columnStatsBuilder.setDistinctValuesCount(Estimate.of(partitioningColumnsDistinctValues.get(column).size()));
        }
        if (statistics.isPresent()) {
            DeltaLakeColumnStatistics deltaLakeColumnStatistics = statistics.get().getColumnStatistics().get(column.getName());
            if (deltaLakeColumnStatistics != null && column.getColumnType() != PARTITION_KEY) {
                columnStatsBuilder.setDistinctValuesCount(Estimate.of(deltaLakeColumnStatistics.getNdvSummary().cardinality()));
            }
        }
        statsBuilder.setColumnStatistics(column, columnStatsBuilder.build());
    }
    return statsBuilder.build();
}

Example 4 with REGULAR

Use of io.trino.plugin.deltalake.DeltaLakeColumnType.REGULAR in project trino by trinodb.

From the class DeltaLakePageSourceProvider, method createPageSource:

@Override
public ConnectorPageSource createPageSource(ConnectorTransactionHandle transaction, ConnectorSession session, ConnectorSplit connectorSplit, ConnectorTableHandle connectorTable, List<ColumnHandle> columns, DynamicFilter dynamicFilter) {
    DeltaLakeSplit split = (DeltaLakeSplit) connectorSplit;
    DeltaLakeTableHandle table = (DeltaLakeTableHandle) connectorTable;
    // We reach here when we could not prune the split using file level stats, table predicate
    // and the dynamic filter in the coordinator during split generation. The file level stats
    // in DeltaLakeSplit#filePredicate could help to prune this split when a more selective dynamic filter
    // is available now, without having to access parquet file footer for row-group stats.
    // We avoid sending DeltaLakeSplit#splitPredicate to workers by using table.getPredicate() here.
    TupleDomain<DeltaLakeColumnHandle> filteredSplitPredicate = TupleDomain.intersect(ImmutableList.of(
            table.getNonPartitionConstraint(),
            split.getStatisticsPredicate(),
            dynamicFilter.getCurrentPredicate().transformKeys(DeltaLakeColumnHandle.class::cast)));
    if (filteredSplitPredicate.isNone()) {
        return new EmptyPageSource();
    }
    List<DeltaLakeColumnHandle> deltaLakeColumns = columns.stream().map(DeltaLakeColumnHandle.class::cast).collect(toImmutableList());
    Map<String, Optional<String>> partitionKeys = split.getPartitionKeys();
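    // Only REGULAR columns are read from the Parquet file; partition key values are filled in from the split's partition keys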
    List<DeltaLakeColumnHandle> regularColumns = deltaLakeColumns.stream()
            .filter(column -> column.getColumnType() == REGULAR)
            .collect(toImmutableList());
    List<HiveColumnHandle> hiveColumnHandles = regularColumns.stream()
            .map(DeltaLakeColumnHandle::toHiveColumnHandle)
            .collect(toImmutableList());
    Path path = new Path(split.getPath());
    HdfsContext hdfsContext = new HdfsContext(session);
    TupleDomain<HiveColumnHandle> parquetPredicate = getParquetTupleDomain(filteredSplitPredicate.simplify(domainCompactionThreshold));
    if (table.getWriteType().isPresent()) {
        return new DeltaLakeUpdatablePageSource(table, deltaLakeColumns, partitionKeys, split.getPath(), split.getFileSize(), split.getFileModifiedTime(), session, executorService, hdfsEnvironment, hdfsContext, parquetDateTimeZone, parquetReaderOptions, parquetPredicate, typeManager, updateResultJsonCodec);
    }
    ReaderPageSource pageSource = ParquetPageSourceFactory.createPageSource(path, split.getStart(), split.getLength(), split.getFileSize(), hiveColumnHandles, parquetPredicate, true, hdfsEnvironment, hdfsEnvironment.getConfiguration(hdfsContext, path), session.getIdentity(), parquetDateTimeZone, fileFormatDataSourceStats, parquetReaderOptions.withMaxReadBlockSize(getParquetMaxReadBlockSize(session)).withUseColumnIndex(isParquetUseColumnIndex(session)));
    verify(pageSource.getReaderColumns().isEmpty(), "All columns expected to be base columns");
    return new DeltaLakePageSource(deltaLakeColumns, partitionKeys, pageSource.get(), split.getPath(), split.getFileSize(), split.getFileModifiedTime());
}

Example 5 with REGULAR

Use of io.trino.plugin.deltalake.DeltaLakeColumnType.REGULAR in project trino by trinodb.

From the class TestDeltaLakeCreateTableStatistics, method testTimestampMilliSingleRecord:

// int96 timestamp statistics are populated only if a row group contains a single value
@Test
public void testTimestampMilliSingleRecord() throws IOException {
    String columnName = "t_timestamp";
    DeltaLakeColumnHandle columnHandle = new DeltaLakeColumnHandle(columnName, TIMESTAMP_TZ_MILLIS, REGULAR);
    try (TestTable table = new TestTable("test_timestamp_single_record_", ImmutableList.of(columnName), "VALUES timestamp '2012-10-31 04:00:00.123 America/New_York', timestamp '2012-10-31 01:00:00.123 America/Los_Angeles', null")) {
        List<AddFileEntry> addFileEntries = getAddFileEntries(table.getName());
        AddFileEntry entry = getOnlyElement(addFileEntries);
        assertThat(entry.getStats()).isPresent();
        DeltaLakeFileStatistics fileStatistics = entry.getStats().get();
        assertEquals(fileStatistics.getNumRecords(), Optional.of(3L));
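        // Statistics for TIMESTAMP_TZ_MILLIS columns are compared as epoch millis packed with the UTC zone key,
        // so convert the expected instants the same way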
        Function<String, Long> timestampValueConverter = valueString -> {
            ZonedDateTime zonedDateTime = ZonedDateTime.parse(valueString);
            Instant instant = zonedDateTime.toInstant();
            return packDateTimeWithZone(instant.toEpochMilli(), UTC_KEY);
        };
        assertEquals(fileStatistics.getMinColumnValue(columnHandle), Optional.of(timestampValueConverter.apply("2012-10-31T08:00:00.123Z")));
        assertEquals(fileStatistics.getMaxColumnValue(columnHandle), Optional.of(timestampValueConverter.apply("2012-10-31T08:00:00.123Z")));
        assertEquals(fileStatistics.getNullCount(columnName), Optional.of(1L));
    }
}

Aggregations

REGULAR (io.trino.plugin.deltalake.DeltaLakeColumnType.REGULAR) 5
List (java.util.List) 5
Optional (java.util.Optional) 5
ImmutableList (com.google.common.collect.ImmutableList) 4
ImmutableMap (com.google.common.collect.ImmutableMap) 4
Map (java.util.Map) 4
ImmutableList.toImmutableList (com.google.common.collect.ImmutableList.toImmutableList) 3
AddFileEntry (io.trino.plugin.deltalake.transactionlog.AddFileEntry) 3
DeltaLakeFileStatistics (io.trino.plugin.deltalake.transactionlog.statistics.DeltaLakeFileStatistics) 3
ColumnMetadata (io.trino.spi.connector.ColumnMetadata) 3
TupleDomain (io.trino.spi.predicate.TupleDomain) 3
TypeManager (io.trino.spi.type.TypeManager) 3
IOException (java.io.IOException) 3
Verify.verify (com.google.common.base.Verify.verify) 2
DeltaLakeColumnHandle (io.trino.plugin.deltalake.DeltaLakeColumnHandle) 2
MetadataEntry (io.trino.plugin.deltalake.transactionlog.MetadataEntry) 2
ProtocolEntry (io.trino.plugin.deltalake.transactionlog.ProtocolEntry) 2
TableSnapshot (io.trino.plugin.deltalake.transactionlog.TableSnapshot) 2
SchemaTableName (io.trino.spi.connector.SchemaTableName) 2
Assert.assertEquals (io.trino.testing.assertions.Assert.assertEquals) 2
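
The recurring pattern across these examples is mapping Delta Lake schema metadata to DeltaLakeColumnHandle instances, with partition columns marked PARTITION_KEY and all other columns marked REGULAR. A minimal sketch of that mapping, assuming the same three-argument constructor and static imports shown in the examples above (the helper name toColumnHandles is illustrative, not part of Trino):

static List<DeltaLakeColumnHandle> toColumnHandles(List<ColumnMetadata> schema, Set<String> canonicalPartitionColumns)
{
    // Hypothetical helper: classify each column the way getTableStatistics does in Example 3
    return schema.stream()
            .map(column -> new DeltaLakeColumnHandle(
                    column.getName(),
                    column.getType(),
                    canonicalPartitionColumns.contains(column.getName()) ? PARTITION_KEY : REGULAR))
            .collect(toImmutableList());
}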