Example 6 with MetadataEntry

use of io.trino.plugin.deltalake.transactionlog.MetadataEntry in project trino by trinodb.

From class HiveMetastoreBackedDeltaLakeMetastore, method createTable.

@Override
public void createTable(ConnectorSession session, Table table, PrincipalPrivileges principalPrivileges) {
    String tableLocation = table.getStorage().getLocation();
    statisticsAccess.invalidateCache(tableLocation);
    transactionLogAccess.invalidateCaches(tableLocation);
    try {
        TableSnapshot tableSnapshot = transactionLogAccess.loadSnapshot(table.getSchemaTableName(), new Path(tableLocation), session);
        Optional<MetadataEntry> maybeMetadata = transactionLogAccess.getMetadataEntry(tableSnapshot, session);
        if (maybeMetadata.isEmpty()) {
            throw new TrinoException(DELTA_LAKE_INVALID_TABLE, "Provided location did not contain a valid Delta Lake table: " + tableLocation);
        }
    } catch (IOException e) {
        throw new TrinoException(DELTA_LAKE_INVALID_TABLE, "Failed to access table location: " + tableLocation, e);
    }
    delegate.createTable(table, principalPrivileges);
}
Also used : Path(org.apache.hadoop.fs.Path) TableSnapshot(io.trino.plugin.deltalake.transactionlog.TableSnapshot) MetadataEntry(io.trino.plugin.deltalake.transactionlog.MetadataEntry) TrinoException(io.trino.spi.TrinoException) IOException(java.io.IOException)
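
For context, the validation above succeeds only when the provided location already contains a Delta Lake transaction log, i.e. a _delta_log directory with at least one commit file (such as 00000000000000000000.json). A minimal local-filesystem sketch of that precondition, using plain java.nio rather than Trino's TransactionLogAccess (the class and method names here are hypothetical):

import java.nio.file.Files;
import java.nio.file.Path;

public class DeltaLogCheck {
    // Rough analogue of what loadSnapshot/getMetadataEntry verify against
    // the object store before the table is registered in the metastore
    static boolean looksLikeDeltaTable(Path tableLocation) {
        Path deltaLog = tableLocation.resolve("_delta_log");
        return Files.isDirectory(deltaLog)
                && Files.exists(deltaLog.resolve("00000000000000000000.json"));
    }
}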

Example 7 with MetadataEntry

use of io.trino.plugin.deltalake.transactionlog.MetadataEntry in project trino by trinodb.

From class HiveMetastoreBackedDeltaLakeMetastore, method getTableStatistics.

@Override
public TableStatistics getTableStatistics(ConnectorSession session, DeltaLakeTableHandle tableHandle, Constraint constraint) {
    TableSnapshot tableSnapshot = getSnapshot(tableHandle.getSchemaTableName(), session);
    double numRecords = 0;
    MetadataEntry metadata = transactionLogAccess.getMetadataEntry(tableSnapshot, session)
            .orElseThrow(() -> new TrinoException(DELTA_LAKE_INVALID_SCHEMA, "Metadata not found in transaction log for " + tableHandle.getTableName()));
    List<ColumnMetadata> columnMetadata = DeltaLakeSchemaSupport.extractSchema(metadata, typeManager);
    List<DeltaLakeColumnHandle> columns = columnMetadata.stream()
            .map(columnMeta -> new DeltaLakeColumnHandle(
                    columnMeta.getName(),
                    columnMeta.getType(),
                    metadata.getCanonicalPartitionColumns().contains(columnMeta.getName()) ? PARTITION_KEY : REGULAR))
            .collect(toImmutableList());
    Map<DeltaLakeColumnHandle, Double> nullCounts = new HashMap<>();
    columns.forEach(column -> nullCounts.put(column, 0.0));
    Map<DeltaLakeColumnHandle, Double> minValues = new HashMap<>();
    Map<DeltaLakeColumnHandle, Double> maxValues = new HashMap<>();
    Map<DeltaLakeColumnHandle, Set<String>> partitioningColumnsDistinctValues = new HashMap<>();
    columns.stream()
            .filter(column -> column.getColumnType() == PARTITION_KEY)
            .forEach(column -> partitioningColumnsDistinctValues.put(column, new HashSet<>()));
    if (tableHandle.getEnforcedPartitionConstraint().isNone() || tableHandle.getNonPartitionConstraint().isNone() || constraint.getSummary().isNone()) {
        return createZeroStatistics(columns);
    }
    Set<String> predicatedColumnNames = tableHandle.getNonPartitionConstraint().getDomains().orElseThrow().keySet().stream()
            .map(DeltaLakeColumnHandle::getName)
            .collect(toImmutableSet());
    List<ColumnMetadata> predicatedColumns = columnMetadata.stream()
            .filter(column -> predicatedColumnNames.contains(column.getName()))
            .collect(toImmutableList());
    for (AddFileEntry addEntry : transactionLogAccess.getActiveFiles(tableSnapshot, session)) {
        Optional<? extends DeltaLakeFileStatistics> fileStatistics = addEntry.getStats();
        if (fileStatistics.isEmpty()) {
            // Open source Delta Lake does not collect stats
            return TableStatistics.empty();
        }
        DeltaLakeFileStatistics stats = fileStatistics.get();
        if (!partitionMatchesPredicate(addEntry.getCanonicalPartitionValues(), tableHandle.getEnforcedPartitionConstraint().getDomains().orElseThrow())) {
            continue;
        }
        TupleDomain<DeltaLakeColumnHandle> statisticsPredicate = createStatisticsPredicate(addEntry, predicatedColumns, tableHandle.getMetadataEntry().getCanonicalPartitionColumns());
        if (!tableHandle.getNonPartitionConstraint().overlaps(statisticsPredicate)) {
            continue;
        }
        if (stats.getNumRecords().isEmpty()) {
            // Not clear if it's possible for stats to be present with no row count, but bail out if that happens
            return TableStatistics.empty();
        }
        numRecords += stats.getNumRecords().get();
        for (DeltaLakeColumnHandle column : columns) {
            if (column.getColumnType() == PARTITION_KEY) {
                Optional<String> partitionValue = addEntry.getCanonicalPartitionValues().get(column.getName());
                if (partitionValue.isEmpty()) {
                    nullCounts.merge(column, (double) stats.getNumRecords().get(), Double::sum);
                } else {
                    // NULL is not counted as a distinct value
                    // The code below assumes that values returned by addEntry.getCanonicalPartitionValues() are normalized;
                    // that may not hold for real, double, timestamp, etc. values
                    partitioningColumnsDistinctValues.get(column).add(partitionValue.get());
                }
            } else {
                Optional<Long> maybeNullCount = stats.getNullCount(column.getName());
                if (maybeNullCount.isPresent()) {
                    nullCounts.put(column, nullCounts.get(column) + maybeNullCount.get());
                } else {
                    // If any individual file fails to report a null count, mark the table-level total as unknown (NaN)
                    nullCounts.put(column, NaN);
                }
            }
            // Math.min and Math.max return NaN if either operand is NaN, so an unknown
            // min/max in any single file makes the table-level value unknown
            stats.getMinColumnValue(column)
                    .map(parsedValue -> toStatsRepresentation(column.getType(), parsedValue))
                    .filter(OptionalDouble::isPresent)
                    .map(OptionalDouble::getAsDouble)
                    .ifPresent(parsedValueAsDouble -> minValues.merge(column, parsedValueAsDouble, Math::min));
            stats.getMaxColumnValue(column)
                    .map(parsedValue -> toStatsRepresentation(column.getType(), parsedValue))
                    .filter(OptionalDouble::isPresent)
                    .map(OptionalDouble::getAsDouble)
                    .ifPresent(parsedValueAsDouble -> maxValues.merge(column, parsedValueAsDouble, Math::max));
        }
    }
    if (numRecords == 0) {
        return createZeroStatistics(columns);
    }
    TableStatistics.Builder statsBuilder = new TableStatistics.Builder().setRowCount(Estimate.of(numRecords));
    Optional<DeltaLakeStatistics> statistics = Optional.empty();
    if (isExtendedStatisticsEnabled(session)) {
        statistics = statisticsAccess.readDeltaLakeStatistics(session, tableHandle.getLocation());
    }
    for (DeltaLakeColumnHandle column : columns) {
        ColumnStatistics.Builder columnStatsBuilder = new ColumnStatistics.Builder();
        Double nullCount = nullCounts.get(column);
        columnStatsBuilder.setNullsFraction(nullCount.isNaN() ? Estimate.unknown() : Estimate.of(nullCount / numRecords));
        Double maxValue = maxValues.get(column);
        Double minValue = minValues.get(column);
        if (isValidInRange(maxValue) && isValidInRange(minValue)) {
            columnStatsBuilder.setRange(new DoubleRange(minValue, maxValue));
        } else if (isValidInRange(maxValue)) {
            columnStatsBuilder.setRange(new DoubleRange(NEGATIVE_INFINITY, maxValue));
        } else if (isValidInRange(minValue)) {
            columnStatsBuilder.setRange(new DoubleRange(minValue, POSITIVE_INFINITY));
        }
        // extend statistics with NDV (number of distinct values)
        if (column.getColumnType() == PARTITION_KEY) {
            columnStatsBuilder.setDistinctValuesCount(Estimate.of(partitioningColumnsDistinctValues.get(column).size()));
        }
        if (statistics.isPresent()) {
            DeltaLakeColumnStatistics deltaLakeColumnStatistics = statistics.get().getColumnStatistics().get(column.getName());
            if (deltaLakeColumnStatistics != null && column.getColumnType() != PARTITION_KEY) {
                columnStatsBuilder.setDistinctValuesCount(Estimate.of(deltaLakeColumnStatistics.getNdvSummary().cardinality()));
            }
        }
        statsBuilder.setColumnStatistics(column, columnStatsBuilder.build());
    }
    return statsBuilder.build();
}
Also used : DeltaLakeStatistics(io.trino.plugin.deltalake.statistics.DeltaLakeStatistics) POSITIVE_INFINITY(java.lang.Double.POSITIVE_INFINITY) PATH_PROPERTY(io.trino.plugin.deltalake.DeltaLakeMetadata.PATH_PROPERTY) TableSnapshot(io.trino.plugin.deltalake.transactionlog.TableSnapshot) Database(io.trino.plugin.hive.metastore.Database) NEGATIVE_INFINITY(java.lang.Double.NEGATIVE_INFINITY) AddFileEntry(io.trino.plugin.deltalake.transactionlog.AddFileEntry) TransactionLogAccess(io.trino.plugin.deltalake.transactionlog.TransactionLogAccess) StatsUtil.toStatsRepresentation(io.trino.spi.statistics.StatsUtil.toStatsRepresentation) TableNotFoundException(io.trino.spi.connector.TableNotFoundException) DeltaLakeMetadata.createStatisticsPredicate(io.trino.plugin.deltalake.DeltaLakeMetadata.createStatisticsPredicate) NaN(java.lang.Double.NaN) DeltaLakeColumnHandle(io.trino.plugin.deltalake.DeltaLakeColumnHandle) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) DeltaLakeTableHandle(io.trino.plugin.deltalake.DeltaLakeTableHandle) Table(io.trino.plugin.hive.metastore.Table) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) MetadataEntry(io.trino.plugin.deltalake.transactionlog.MetadataEntry) Set(java.util.Set) DeltaLakeSplitManager.partitionMatchesPredicate(io.trino.plugin.deltalake.DeltaLakeSplitManager.partitionMatchesPredicate) DeltaLakeSchemaSupport(io.trino.plugin.deltalake.transactionlog.DeltaLakeSchemaSupport) TrinoException(io.trino.spi.TrinoException) SchemaTableName(io.trino.spi.connector.SchemaTableName) String.format(java.lang.String.format) List(java.util.List) Optional(java.util.Optional) REGULAR(io.trino.plugin.deltalake.DeltaLakeColumnType.REGULAR) DoubleRange(io.trino.spi.statistics.DoubleRange) Constraint(io.trino.spi.connector.Constraint) DeltaLakeFileStatistics(io.trino.plugin.deltalake.transactionlog.statistics.DeltaLakeFileStatistics) CachingDeltaLakeStatisticsAccess(io.trino.plugin.deltalake.statistics.CachingDeltaLakeStatisticsAccess) ColumnMetadata(io.trino.spi.connector.ColumnMetadata) DeltaLakeColumnStatistics(io.trino.plugin.deltalake.statistics.DeltaLakeColumnStatistics) DELTA_LAKE_INVALID_TABLE(io.trino.plugin.deltalake.DeltaLakeErrorCode.DELTA_LAKE_INVALID_TABLE) OptionalDouble(java.util.OptionalDouble) HashMap(java.util.HashMap) HashSet(java.util.HashSet) HiveMetastore(io.trino.plugin.hive.metastore.HiveMetastore) Objects.requireNonNull(java.util.Objects.requireNonNull) TableStatistics(io.trino.spi.statistics.TableStatistics) ImmutableSet.toImmutableSet(com.google.common.collect.ImmutableSet.toImmutableSet) DeltaLakeSessionProperties.isExtendedStatisticsEnabled(io.trino.plugin.deltalake.DeltaLakeSessionProperties.isExtendedStatisticsEnabled) Estimate(io.trino.spi.statistics.Estimate) ColumnStatistics(io.trino.spi.statistics.ColumnStatistics) PARTITION_KEY(io.trino.plugin.deltalake.DeltaLakeColumnType.PARTITION_KEY) IOException(java.io.IOException) ConnectorSession(io.trino.spi.connector.ConnectorSession) TupleDomain(io.trino.spi.predicate.TupleDomain) DELTA_LAKE_INVALID_SCHEMA(io.trino.plugin.deltalake.DeltaLakeErrorCode.DELTA_LAKE_INVALID_SCHEMA) ProtocolEntry(io.trino.plugin.deltalake.transactionlog.ProtocolEntry) PrincipalPrivileges(io.trino.plugin.hive.metastore.PrincipalPrivileges) TypeManager(io.trino.spi.type.TypeManager)
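
The statistics merge above relies on NaN as an "unknown" marker: a single file without a null count poisons the column total, and NaN also propagates through the Math.min/Math.max merges, which is later translated into Estimate.unknown(). A standalone sketch of that behavior (plain Java, no Trino types):

public class NaNPropagationDemo {
    public static void main(String[] args) {
        // A file with no null count marks the column total as unknown,
        // and later files cannot "repair" it
        double nullCount = 10.0;
        nullCount = Double.NaN;
        nullCount = nullCount + 25.0;
        System.out.println(nullCount); // NaN

        // NaN also wins through Math.min/Math.max, so one file with an
        // unknown min/max makes the table-level range unknown
        System.out.println(Math.min(5.0, Double.NaN)); // NaN
        System.out.println(Math.max(5.0, Double.NaN)); // NaN
    }
}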

Example 8 with MetadataEntry

use of io.trino.plugin.deltalake.transactionlog.MetadataEntry in project trino by trinodb.

From class DeltaLakeMetadata, method getTableHandleForStatisticsCollection.

@Nullable
@Override
public ConnectorTableHandle getTableHandleForStatisticsCollection(ConnectorSession session, SchemaTableName tableName, Map<String, Object> analyzeProperties) {
    Optional<Table> table = metastore.getTable(tableName.getSchemaName(), tableName.getTableName());
    if (table.isEmpty()) {
        return null;
    }
    if (!isExtendedStatisticsEnabled(session)) {
        throw new TrinoException(NOT_SUPPORTED, "ANALYZE not supported if extended statistics are disabled. Enable via delta.extended-statistics.enabled config property or extended_statistics_enabled session property.");
    }
    Optional<Instant> filesModifiedAfterFromProperties = DeltaLakeAnalyzeProperties.getFilesModifiedAfterProperty(analyzeProperties);
    TableSnapshot tableSnapshot = metastore.getSnapshot(tableName, session);
    long version = tableSnapshot.getVersion();
    String tableLocation = metastore.getTableLocation(tableName, session);
    Optional<DeltaLakeStatistics> statistics = statisticsAccess.readDeltaLakeStatistics(session, tableLocation);
    Optional<Instant> alreadyAnalyzedModifiedTimeMax = statistics.map(DeltaLakeStatistics::getAlreadyAnalyzedModifiedTimeMax);
    // determine list of files we want to read based on what caller requested via files_modified_after and what files were already analyzed in the past
    Optional<Instant> filesModifiedAfter = Optional.empty();
    if (filesModifiedAfterFromProperties.isPresent() || alreadyAnalyzedModifiedTimeMax.isPresent()) {
        filesModifiedAfter = Optional.of(Comparators.max(filesModifiedAfterFromProperties.orElse(Instant.ofEpochMilli(0)), alreadyAnalyzedModifiedTimeMax.orElse(Instant.ofEpochMilli(0))));
    }
    MetadataEntry metadata = metastore.getMetadata(tableSnapshot, session)
            .orElseThrow(() -> new TrinoException(DELTA_LAKE_INVALID_SCHEMA, "Metadata not found in transaction log for " + tableName));
    Optional<Set<String>> analyzeColumnNames = DeltaLakeAnalyzeProperties.getColumnNames(analyzeProperties);
    if (analyzeColumnNames.isPresent()) {
        Set<String> columnNames = analyzeColumnNames.get();
        // validate the column names passed via the `columns` analyze property
        if (columnNames.isEmpty()) {
            throw new TrinoException(INVALID_ANALYZE_PROPERTY, "Cannot specify empty list of columns for analysis");
        }
        Set<String> allColumnNames = extractSchema(metadata, typeManager).stream().map(ColumnMetadata::getName).collect(toImmutableSet());
        if (!allColumnNames.containsAll(columnNames)) {
            throw new TrinoException(INVALID_ANALYZE_PROPERTY, format("Invalid columns specified for analysis: %s", Sets.difference(columnNames, allColumnNames)));
        }
    }
    // verify that we do not extend the set of analyzed columns
    Optional<Set<String>> oldAnalyzeColumnNames = statistics.flatMap(DeltaLakeStatistics::getAnalyzedColumns);
    if (oldAnalyzeColumnNames.isPresent()) {
        if (analyzeColumnNames.isEmpty() || !oldAnalyzeColumnNames.get().containsAll(analyzeColumnNames.get())) {
            throw new TrinoException(INVALID_ANALYZE_PROPERTY, "List of columns to be analyzed must be a subset of previously used. To extend list of analyzed columns drop table statistics");
        }
    }
    AnalyzeHandle analyzeHandle = new AnalyzeHandle(version, statistics.isEmpty(), filesModifiedAfter, analyzeColumnNames);
    return new DeltaLakeTableHandle(tableName.getSchemaName(), tableName.getTableName(), tableLocation, Optional.of(metadata), TupleDomain.all(), TupleDomain.all(), Optional.empty(), Optional.empty(), Optional.empty(), Optional.empty(), Optional.of(analyzeHandle), version);
}
Also used : Table(io.trino.plugin.hive.metastore.Table) HiveUtil.isDeltaLakeTable(io.trino.plugin.hive.util.HiveUtil.isDeltaLakeTable) Set(java.util.Set) ImmutableSet.toImmutableSet(com.google.common.collect.ImmutableSet.toImmutableSet) ImmutableSet(com.google.common.collect.ImmutableSet) MetastoreUtil.buildInitialPrivilegeSet(io.trino.plugin.hive.metastore.MetastoreUtil.buildInitialPrivilegeSet) Instant(java.time.Instant) DeltaLakeStatistics(io.trino.plugin.deltalake.statistics.DeltaLakeStatistics) TableSnapshot(io.trino.plugin.deltalake.transactionlog.TableSnapshot) TrinoException(io.trino.spi.TrinoException) MetadataEntry(io.trino.plugin.deltalake.transactionlog.MetadataEntry) Nullable(javax.annotation.Nullable)
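
The cutoff computation above takes the later of the caller-supplied files_modified_after property and the newest modification time covered by previous ANALYZE runs, so already-analyzed files are not re-read. A self-contained sketch of the same logic using only java.time (class and method names here are hypothetical):

import java.time.Instant;
import java.util.Optional;

public class AnalyzeCutoffDemo {
    static Optional<Instant> effectiveCutoff(Optional<Instant> fromProperties, Optional<Instant> alreadyAnalyzed) {
        if (fromProperties.isEmpty() && alreadyAnalyzed.isEmpty()) {
            return Optional.empty();
        }
        Instant a = fromProperties.orElse(Instant.ofEpochMilli(0));
        Instant b = alreadyAnalyzed.orElse(Instant.ofEpochMilli(0));
        // Comparators.max(a, b) in the Guava-based original
        return Optional.of(a.isAfter(b) ? a : b);
    }

    public static void main(String[] args) {
        System.out.println(effectiveCutoff(
                Optional.of(Instant.parse("2022-01-01T00:00:00Z")),
                Optional.of(Instant.parse("2022-03-01T00:00:00Z"))));
        // -> Optional[2022-03-01T00:00:00Z]
    }
}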

Example 9 with MetadataEntry

use of io.trino.plugin.deltalake.transactionlog.MetadataEntry in project trino by trinodb.

From class DeltaLakeMetadata, method appendInitialTableEntries.

private static void appendInitialTableEntries(TransactionLogWriter transactionLogWriter, List<DeltaLakeColumnHandle> columns, List<String> partitionColumnNames, Map<String, String> configuration, String operation, ConnectorSession session, String nodeVersion, String nodeId) {
    long createdTime = System.currentTimeMillis();
    transactionLogWriter.appendCommitInfoEntry(new CommitInfoEntry(0, createdTime, session.getUser(), session.getUser(), operation, ImmutableMap.of("queryId", session.getQueryId()), null, null, "trino-" + nodeVersion + "-" + nodeId, 0, ISOLATION_LEVEL, true));
    transactionLogWriter.appendProtocolEntry(new ProtocolEntry(READER_VERSION, WRITER_VERSION));
    transactionLogWriter.appendMetadataEntry(new MetadataEntry(randomUUID().toString(), null, null, new Format("parquet", ImmutableMap.of()), serializeSchemaAsJson(columns), partitionColumnNames, ImmutableMap.copyOf(configuration), createdTime));
}
Also used : Format(io.trino.plugin.deltalake.transactionlog.MetadataEntry.Format) StorageFormat(io.trino.plugin.hive.metastore.StorageFormat) ProtocolEntry(io.trino.plugin.deltalake.transactionlog.ProtocolEntry) MetadataEntry(io.trino.plugin.deltalake.transactionlog.MetadataEntry) CommitInfoEntry(io.trino.plugin.deltalake.transactionlog.CommitInfoEntry)
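
For orientation, the three entries appended above become the first commit of the table's transaction log. Roughly, and with illustrative values (the exact fields and version numbers depend on the Delta Lake protocol version in use), _delta_log/00000000000000000000.json would contain one JSON object per line:

public class InitialCommitShape {
    // Illustrative only: abbreviated field subsets, hypothetical values
    static final String FIRST_COMMIT = """
            {"commitInfo":{"timestamp":1650000000000,"operation":"CREATE TABLE","isBlindAppend":true}}
            {"protocol":{"minReaderVersion":1,"minWriterVersion":2}}
            {"metaData":{"id":"<uuid>","format":{"provider":"parquet","options":{}},"schemaString":"<json schema>","partitionColumns":[],"configuration":{},"createdTime":1650000000000}}
            """;
}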

Example 10 with MetadataEntry

use of io.trino.plugin.deltalake.transactionlog.MetadataEntry in project trino by trinodb.

From class CheckpointEntryIterator, method buildMetadataEntry.

private DeltaLakeTransactionLogEntry buildMetadataEntry(ConnectorSession session, Block block, int pagePosition) {
    log.debug("Building metadata entry from %s pagePosition %d", block, pagePosition);
    if (block.isNull(pagePosition)) {
        return null;
    }
    int metadataFields = 8;
    int formatFields = 2;
    Block metadataEntryBlock = block.getObject(pagePosition, Block.class);
    log.debug("Block %s has %s fields", block, metadataEntryBlock.getPositionCount());
    if (metadataEntryBlock.getPositionCount() != metadataFields) {
        throw new TrinoException(DELTA_LAKE_INVALID_SCHEMA, format("Expected block %s to have %d children, but found %s", block, metadataFields, metadataEntryBlock.getPositionCount()));
    }
    Block formatBlock = metadataEntryBlock.getObject(3, Block.class);
    if (formatBlock.getPositionCount() != formatFields) {
        throw new TrinoException(DELTA_LAKE_INVALID_SCHEMA, format("Expected block %s to have %d children, but found %s", formatBlock, formatFields, formatBlock.getPositionCount()));
    }
    MetadataEntry result = new MetadataEntry(
            getString(metadataEntryBlock, 0),
            getString(metadataEntryBlock, 1),
            getString(metadataEntryBlock, 2),
            new MetadataEntry.Format(getString(formatBlock, 0), getMap(formatBlock, 1)),
            getString(metadataEntryBlock, 4),
            getList(metadataEntryBlock, 5),
            getMap(metadataEntryBlock, 6),
            getLong(metadataEntryBlock, 7));
    log.debug("Result: %s", result);
    return DeltaLakeTransactionLogEntry.metadataEntry(result);
}
Also used : Block(io.trino.spi.block.Block) TrinoException(io.trino.spi.TrinoException) MetadataEntry(io.trino.plugin.deltalake.transactionlog.MetadataEntry)
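
As a reading aid for the positional accesses above, the metaData struct in a Delta checkpoint carries exactly eight fields, matching the MetadataEntry constructor's parameter order. A hypothetical index map (not part of the Trino codebase):

enum MetadataField {
    ID(0),                // getString(metadataEntryBlock, 0)
    NAME(1),
    DESCRIPTION(2),
    FORMAT(3),            // nested struct: provider (index 0), options (index 1)
    SCHEMA_STRING(4),     // JSON-serialized table schema
    PARTITION_COLUMNS(5), // list of strings
    CONFIGURATION(6),     // string-to-string map
    CREATED_TIME(7);      // epoch millis

    final int index;

    MetadataField(int index) {
        this.index = index;
    }
}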

Aggregations

MetadataEntry (io.trino.plugin.deltalake.transactionlog.MetadataEntry) 16
Test (org.testng.annotations.Test) 9
ProtocolEntry (io.trino.plugin.deltalake.transactionlog.ProtocolEntry) 6
AddFileEntry (io.trino.plugin.deltalake.transactionlog.AddFileEntry) 5
Path (org.apache.hadoop.fs.Path) 5
TableSnapshot (io.trino.plugin.deltalake.transactionlog.TableSnapshot) 4
Table (io.trino.plugin.hive.metastore.Table) 4
TrinoException (io.trino.spi.TrinoException) 4
RemoveFileEntry (io.trino.plugin.deltalake.transactionlog.RemoveFileEntry) 3
Block (io.trino.spi.block.Block) 3
ImmutableSet.toImmutableSet (com.google.common.collect.ImmutableSet.toImmutableSet) 2
DeltaLakeTableHandle (io.trino.plugin.deltalake.DeltaLakeTableHandle) 2
DeltaLakeStatistics (io.trino.plugin.deltalake.statistics.DeltaLakeStatistics) 2
DeltaLakeTransactionLogEntry (io.trino.plugin.deltalake.transactionlog.DeltaLakeTransactionLogEntry) 2
TransactionEntry (io.trino.plugin.deltalake.transactionlog.TransactionEntry) 2
DeltaLakeParquetFileStatistics (io.trino.plugin.deltalake.transactionlog.statistics.DeltaLakeParquetFileStatistics) 2
RowBlock (io.trino.spi.block.RowBlock) 2
Utils.nativeValueToBlock (io.trino.spi.predicate.Utils.nativeValueToBlock) 2
TypeManager (io.trino.spi.type.TypeManager) 2
File (java.io.File) 2