use of io.trino.plugin.deltalake.DeltaLakeColumnHandle in project trino by trinodb.
the class TestDeltaLakeSchemaSupport method testRoundTripComplexSchema.
/**
 * Checks that a complex Delta Lake schema survives a parse/serialize round trip:
 * the reference JSON is parsed into column metadata, turned into regular column
 * handles, serialized again, and the result is compared with the input as parsed
 * JSON trees rather than as raw strings.
 */
@Test
public void testRoundTripComplexSchema() throws IOException, URISyntaxException {
    URL schemaResource = getResource("io/trino/plugin/deltalake/transactionlog/schema/complex_schema.json");
    Path schemaFile = Path.of(schemaResource.toURI());
    String originalJson = Files.readString(schemaFile);

    // Parse the reference schema and rebuild a REGULAR column handle for every column.
    List<ColumnMetadata> parsedColumns = DeltaLakeSchemaSupport.getColumnMetadata(originalJson, typeManager);
    List<DeltaLakeColumnHandle> handles = parsedColumns.stream()
            .map(column -> new DeltaLakeColumnHandle(column.getName(), column.getType(), REGULAR))
            .collect(toImmutableList());

    ObjectMapper jsonMapper = new ObjectMapper();
    String roundTrippedJson = serializeSchemaAsJson(handles);
    assertEquals(jsonMapper.readTree(roundTrippedJson), jsonMapper.readTree(originalJson));
}
use of io.trino.plugin.deltalake.DeltaLakeColumnHandle in project trino by trinodb.
the class TestDeltaLakeSchemaSupport method testSerializeSchemaAsJson.
/**
 * Serializes three columns with nested types (an array of arrays, a row containing
 * a row, and a map of maps) and compares the produced JSON with the
 * {@code nested_schema.json} resource as parsed JSON trees.
 */
@Test
public void testSerializeSchemaAsJson() throws Exception {
    // "arr": array(array(integer))
    ArrayType nestedArrayType = new ArrayType(new ArrayType(INTEGER));
    DeltaLakeColumnHandle arrayColumn = new DeltaLakeColumnHandle("arr", nestedArrayType, REGULAR);

    // "str": row(s1 varchar, s2 row(i1 integer, d2 decimal(38, 0)))
    RowType innerRowType = RowType.from(ImmutableList.of(
            new RowType.Field(Optional.of("i1"), INTEGER),
            new RowType.Field(Optional.of("d2"), DecimalType.createDecimalType(38, 0))));
    RowType outerRowType = RowType.from(ImmutableList.of(
            new RowType.Field(Optional.of("s1"), VarcharType.createUnboundedVarcharType()),
            new RowType.Field(Optional.of("s2"), innerRowType)));
    DeltaLakeColumnHandle structColumn = new DeltaLakeColumnHandle("str", outerRowType, REGULAR);

    // "m": map(integer, map(integer, integer))
    TypeOperators typeOperators = new TypeOperators();
    MapType innerMapType = new MapType(INTEGER, INTEGER, typeOperators);
    MapType outerMapType = new MapType(INTEGER, innerMapType, typeOperators);
    DeltaLakeColumnHandle mapColumn = new DeltaLakeColumnHandle("m", outerMapType, REGULAR);

    URL expectedResource = getResource("io/trino/plugin/deltalake/transactionlog/schema/nested_schema.json");
    ObjectMapper jsonMapper = new ObjectMapper();
    String actualJson = serializeSchemaAsJson(ImmutableList.of(arrayColumn, structColumn, mapColumn));
    assertEquals(jsonMapper.readTree(actualJson), jsonMapper.readTree(expectedResource));
}
use of io.trino.plugin.deltalake.DeltaLakeColumnHandle in project trino by trinodb.
the class TestDeltaLakeMetastoreStatistics method testStatisticsParquetParsedStatistics.
/**
 * Reads table statistics for the {@code parquet_struct_statistics} table and checks,
 * for each column, that the nulls fraction is zero and that the reported min/max
 * range matches the expected values.
 */
@Test
public void testStatisticsParquetParsedStatistics() {
    // The transaction log for this table was created so that the checkpoints only write struct statistics, not json statistics
    DeltaLakeTableHandle tableHandle = registerTable("parquet_struct_statistics");
    TableStatistics tableStatistics = deltaLakeMetastore.getTableStatistics(SESSION, tableHandle, Constraint.alwaysTrue());
    assertEquals(tableStatistics.getRowCount(), Estimate.of(9));
    Map<ColumnHandle, ColumnStatistics> byColumn = tableStatistics.getColumnStatistics();

    // Short decimal
    ColumnStatistics decShortStats = byColumn.get(new DeltaLakeColumnHandle("dec_short", DecimalType.createDecimalType(5, 1), REGULAR));
    assertEquals(decShortStats.getNullsFraction(), Estimate.zero());
    assertEquals(decShortStats.getRange().orElseThrow().getMin(), -10.1);
    assertEquals(decShortStats.getRange().orElseThrow().getMax(), 10.1);

    // Long decimal
    ColumnStatistics decLongStats = byColumn.get(new DeltaLakeColumnHandle("dec_long", DecimalType.createDecimalType(25, 3), REGULAR));
    assertEquals(decLongStats.getNullsFraction(), Estimate.zero());
    assertEquals(decLongStats.getRange().orElseThrow().getMin(), -999999999999.123);
    assertEquals(decLongStats.getRange().orElseThrow().getMax(), 999999999999.123);

    // Integral types
    ColumnStatistics bigintStats = byColumn.get(new DeltaLakeColumnHandle("l", BIGINT, REGULAR));
    assertEquals(bigintStats.getNullsFraction(), Estimate.zero());
    assertEquals(bigintStats.getRange().orElseThrow().getMin(), -10000000.0);
    assertEquals(bigintStats.getRange().orElseThrow().getMax(), 10000000.0);

    ColumnStatistics integerStats = byColumn.get(new DeltaLakeColumnHandle("in", INTEGER, REGULAR));
    assertEquals(integerStats.getNullsFraction(), Estimate.zero());
    assertEquals(integerStats.getRange().orElseThrow().getMin(), -20000000.0);
    assertEquals(integerStats.getRange().orElseThrow().getMax(), 20000000.0);

    ColumnStatistics smallintStats = byColumn.get(new DeltaLakeColumnHandle("sh", SMALLINT, REGULAR));
    assertEquals(smallintStats.getNullsFraction(), Estimate.zero());
    assertEquals(smallintStats.getRange().orElseThrow().getMin(), -123.0);
    assertEquals(smallintStats.getRange().orElseThrow().getMax(), 123.0);

    ColumnStatistics tinyintStats = byColumn.get(new DeltaLakeColumnHandle("byt", TINYINT, REGULAR));
    assertEquals(tinyintStats.getNullsFraction(), Estimate.zero());
    assertEquals(tinyintStats.getRange().orElseThrow().getMin(), -42.0);
    assertEquals(tinyintStats.getRange().orElseThrow().getMax(), 42.0);

    // Floating point types; the REAL bounds are narrowed to float before comparing
    ColumnStatistics realStats = byColumn.get(new DeltaLakeColumnHandle("fl", REAL, REGULAR));
    assertEquals(realStats.getNullsFraction(), Estimate.zero());
    assertEquals((float) realStats.getRange().orElseThrow().getMin(), -0.123f);
    assertEquals((float) realStats.getRange().orElseThrow().getMax(), 0.123f);

    ColumnStatistics doubleStats = byColumn.get(new DeltaLakeColumnHandle("dou", DOUBLE, REGULAR));
    assertEquals(doubleStats.getNullsFraction(), Estimate.zero());
    assertEquals(doubleStats.getRange().orElseThrow().getMin(), -0.321);
    assertEquals(doubleStats.getRange().orElseThrow().getMax(), 0.321);

    // Dates are compared as epoch-day counts
    ColumnStatistics dateStats = byColumn.get(new DeltaLakeColumnHandle("dat", DATE, REGULAR));
    assertEquals(dateStats.getNullsFraction(), Estimate.zero());
    assertEquals(dateStats.getRange().orElseThrow().getMin(), (double) LocalDate.parse("1900-01-01").toEpochDay());
    assertEquals(dateStats.getRange().orElseThrow().getMax(), (double) LocalDate.parse("5000-01-01").toEpochDay());
}
use of io.trino.plugin.deltalake.DeltaLakeColumnHandle in project trino by trinodb.
the class TestDeltaLakeMetastoreStatistics method testStatisticsParquetParsedStatisticsNaNValues.
/**
 * Reads table statistics for the {@code parquet_struct_statistics_nan} table and
 * checks that the REAL and DOUBLE columns report a zero nulls fraction but no
 * value range.
 */
@Test
public void testStatisticsParquetParsedStatisticsNaNValues() {
    // The transaction log for this table was created so that the checkpoints only write struct statistics, not json statistics
    // The table has a REAL and DOUBLE columns each with 9 values, one of them being NaN
    DeltaLakeTableHandle tableHandle = registerTable("parquet_struct_statistics_nan");
    TableStatistics tableStatistics = deltaLakeMetastore.getTableStatistics(SESSION, tableHandle, Constraint.alwaysTrue());
    assertEquals(tableStatistics.getRowCount(), Estimate.of(9));
    Map<ColumnHandle, ColumnStatistics> byColumn = tableStatistics.getColumnStatistics();

    ColumnStatistics realStats = byColumn.get(new DeltaLakeColumnHandle("fl", REAL, REGULAR));
    assertEquals(realStats.getNullsFraction(), Estimate.zero());
    assertThat(realStats.getRange()).isEmpty();

    ColumnStatistics doubleStats = byColumn.get(new DeltaLakeColumnHandle("dou", DOUBLE, REGULAR));
    assertEquals(doubleStats.getNullsFraction(), Estimate.zero());
    assertThat(doubleStats.getRange()).isEmpty();
}
use of io.trino.plugin.deltalake.DeltaLakeColumnHandle in project trino by trinodb.
the class HiveMetastoreBackedDeltaLakeMetastore method getTableStatistics.
/**
 * Builds table statistics by aggregating the per-file statistics of the active files
 * listed in the transaction log snapshot for the given table.
 *
 * <p>Files are skipped when their partition values do not match the handle's enforced
 * partition constraint, or when the predicate built from their statistics does not
 * overlap the handle's non-partition constraint.
 *
 * @param session the connector session
 * @param tableHandle the table whose statistics are requested
 * @param constraint the additional constraint; only its summary is checked for "none"
 * @return zero statistics when any of the constraints is "none" or no records were
 *         counted; empty statistics when an active file has no statistics or no record
 *         count; otherwise the row count plus per-column nulls fraction, range and
 *         distinct values count where available
 * @throws TrinoException with {@code DELTA_LAKE_INVALID_SCHEMA} when the transaction log
 *         has no metadata entry for the table
 */
@Override
public TableStatistics getTableStatistics(ConnectorSession session, DeltaLakeTableHandle tableHandle, Constraint constraint) {
    TableSnapshot tableSnapshot = getSnapshot(tableHandle.getSchemaTableName(), session);
    MetadataEntry metadata = transactionLogAccess.getMetadataEntry(tableSnapshot, session)
            .orElseThrow(() -> new TrinoException(DELTA_LAKE_INVALID_SCHEMA, "Metadata not found in transaction log for " + tableHandle.getTableName()));
    List<ColumnMetadata> columnMetadata = DeltaLakeSchemaSupport.extractSchema(metadata, typeManager);
    List<DeltaLakeColumnHandle> columns = columnMetadata.stream()
            .map(columnMeta -> new DeltaLakeColumnHandle(
                    columnMeta.getName(),
                    columnMeta.getType(),
                    metadata.getCanonicalPartitionColumns().contains(columnMeta.getName()) ? PARTITION_KEY : REGULAR))
            .collect(toImmutableList());

    // A "none" constraint cannot match any row, so there is nothing to aggregate
    if (tableHandle.getEnforcedPartitionConstraint().isNone()
            || tableHandle.getNonPartitionConstraint().isNone()
            || constraint.getSummary().isNone()) {
        return createZeroStatistics(columns);
    }

    // Per-column accumulators: every column starts with a zero null count, and every
    // partition column gets a set collecting its distinct partition values
    Map<DeltaLakeColumnHandle, Double> nullCounts = new HashMap<>();
    Map<DeltaLakeColumnHandle, Double> minValues = new HashMap<>();
    Map<DeltaLakeColumnHandle, Double> maxValues = new HashMap<>();
    Map<DeltaLakeColumnHandle, Set<String>> partitioningColumnsDistinctValues = new HashMap<>();
    for (DeltaLakeColumnHandle column : columns) {
        nullCounts.put(column, 0.0);
        if (column.getColumnType() == PARTITION_KEY) {
            partitioningColumnsDistinctValues.put(column, new HashSet<>());
        }
    }

    Set<String> predicatedColumnNames = tableHandle.getNonPartitionConstraint().getDomains().orElseThrow().keySet().stream()
            .map(DeltaLakeColumnHandle::getName)
            .collect(toImmutableSet());
    List<ColumnMetadata> predicatedColumns = columnMetadata.stream()
            .filter(column -> predicatedColumnNames.contains(column.getName()))
            .collect(toImmutableList());

    double numRecords = 0L;
    for (AddFileEntry addEntry : transactionLogAccess.getActiveFiles(tableSnapshot, session)) {
        Optional<? extends DeltaLakeFileStatistics> fileStatistics = addEntry.getStats();
        if (fileStatistics.isEmpty()) {
            // Open source Delta Lake does not collect stats
            return TableStatistics.empty();
        }
        DeltaLakeFileStatistics stats = fileStatistics.get();

        // Short-circuit keeps the original order: the statistics predicate is only built
        // for files whose partition values match the enforced partition constraint
        boolean fileSelected = partitionMatchesPredicate(addEntry.getCanonicalPartitionValues(), tableHandle.getEnforcedPartitionConstraint().getDomains().orElseThrow())
                && tableHandle.getNonPartitionConstraint().overlaps(
                        createStatisticsPredicate(addEntry, predicatedColumns, tableHandle.getMetadataEntry().getCanonicalPartitionColumns()));
        if (!fileSelected) {
            continue;
        }

        if (stats.getNumRecords().isEmpty()) {
            // Not clear if it's possible for stats to be present with no row count, but bail out if that happens
            return TableStatistics.empty();
        }
        numRecords += stats.getNumRecords().get();

        for (DeltaLakeColumnHandle column : columns) {
            if (column.getColumnType() != PARTITION_KEY) {
                Optional<Long> reportedNullCount = stats.getNullCount(column.getName());
                // If any individual file fails to report null counts, fail to calculate the total for the table
                double updatedNullCount = reportedNullCount.isPresent()
                        ? nullCounts.get(column) + reportedNullCount.get()
                        : NaN;
                nullCounts.put(column, updatedNullCount);
            } else {
                Optional<String> partitionValue = addEntry.getCanonicalPartitionValues().get(column.getName());
                if (partitionValue.isPresent()) {
                    // NULL is not counted as a distinct value
                    // Code below assumes that values returned by addEntry.getCanonicalPartitionValues() are normalized,
                    // it may not be true in case of real, doubles, timestamps etc
                    partitioningColumnsDistinctValues.get(column).add(partitionValue.get());
                } else {
                    // A file without a value for the partition column counts all of its rows as nulls
                    nullCounts.merge(column, (double) stats.getNumRecords().get(), Double::sum);
                }
            }

            // Math.min returns NaN if any operand is NaN
            stats.getMinColumnValue(column)
                    .map(rawMin -> toStatsRepresentation(column.getType(), rawMin))
                    .filter(OptionalDouble::isPresent)
                    .map(OptionalDouble::getAsDouble)
                    .ifPresent(fileMin -> minValues.merge(column, fileMin, Math::min));
            stats.getMaxColumnValue(column)
                    .map(rawMax -> toStatsRepresentation(column.getType(), rawMax))
                    .filter(OptionalDouble::isPresent)
                    .map(OptionalDouble::getAsDouble)
                    .ifPresent(fileMax -> maxValues.merge(column, fileMax, Math::max));
        }
    }

    if (numRecords == 0) {
        return createZeroStatistics(columns);
    }

    TableStatistics.Builder statsBuilder = new TableStatistics.Builder().setRowCount(Estimate.of(numRecords));

    Optional<DeltaLakeStatistics> statistics = isExtendedStatisticsEnabled(session)
            ? statisticsAccess.readDeltaLakeStatistics(session, tableHandle.getLocation())
            : Optional.empty();

    for (DeltaLakeColumnHandle column : columns) {
        ColumnStatistics.Builder columnStatsBuilder = new ColumnStatistics.Builder();

        Double nullCount = nullCounts.get(column);
        if (nullCount.isNaN()) {
            columnStatsBuilder.setNullsFraction(Estimate.unknown());
        } else {
            columnStatsBuilder.setNullsFraction(Estimate.of(nullCount / numRecords));
        }

        // Use whichever bounds are valid; a missing bound is replaced by the matching infinity
        Double maxValue = maxValues.get(column);
        Double minValue = minValues.get(column);
        boolean hasMax = isValidInRange(maxValue);
        boolean hasMin = isValidInRange(minValue);
        if (hasMax || hasMin) {
            columnStatsBuilder.setRange(new DoubleRange(
                    hasMin ? minValue : NEGATIVE_INFINITY,
                    hasMax ? maxValue : POSITIVE_INFINITY));
        }

        // extend statistics with NDV
        boolean partitionColumn = column.getColumnType() == PARTITION_KEY;
        if (partitionColumn) {
            columnStatsBuilder.setDistinctValuesCount(Estimate.of(partitioningColumnsDistinctValues.get(column).size()));
        }
        // Extended statistics supply the NDV for non-partition columns only
        statistics
                .map(extended -> extended.getColumnStatistics().get(column.getName()))
                .filter(columnNdv -> !partitionColumn)
                .ifPresent(columnNdv -> columnStatsBuilder.setDistinctValuesCount(Estimate.of(columnNdv.getNdvSummary().cardinality())));

        statsBuilder.setColumnStatistics(column, columnStatsBuilder.build());
    }
    return statsBuilder.build();
}
Aggregations