Use of io.trino.plugin.deltalake.DeltaLakeColumnType.REGULAR in project trino by trinodb.
From the class TestDeltaLakeSchemaSupport, method testRoundTripComplexSchema.
/**
 * Round-trips a complex schema: the JSON resource is parsed into column metadata, each column
 * is wrapped in a {@code REGULAR} column handle, and the handles are serialized back to JSON.
 * The serialized result must parse to the same JSON tree as the original document.
 */
@Test
public void testRoundTripComplexSchema()
        throws IOException, URISyntaxException
{
    URL schemaResource = getResource("io/trino/plugin/deltalake/transactionlog/schema/complex_schema.json");
    String originalJson = Files.readString(Path.of(schemaResource.toURI()));

    List<ColumnMetadata> parsedColumns = DeltaLakeSchemaSupport.getColumnMetadata(originalJson, typeManager);
    List<DeltaLakeColumnHandle> handles = parsedColumns.stream()
            .map(column -> new DeltaLakeColumnHandle(column.getName(), column.getType(), REGULAR))
            .collect(toImmutableList());

    // Compare the parsed JSON trees instead of the raw strings, so that only the
    // structure and values have to match, not the textual formatting.
    ObjectMapper mapper = new ObjectMapper();
    assertEquals(mapper.readTree(serializeSchemaAsJson(handles)), mapper.readTree(originalJson));
}
Use of io.trino.plugin.deltalake.DeltaLakeColumnType.REGULAR in project trino by trinodb.
From the class TestTransactionLogAccess, method testSnapshotsAreConsistent.
/**
 * Verifies that an already loaded table snapshot keeps reporting the same active files after
 * further transaction log entries are copied into the table's log directory, while a snapshot
 * loaded afterwards does report the newly added files.
 */
@Test
public void testSnapshotsAreConsistent()
        throws Exception
{
    String tableName = "person";
    File tempDir = Files.createTempDir();
    File tableDir = new File(tempDir, tableName);
    File transactionLogDir = new File(tableDir, TRANSACTION_LOG_DIRECTORY);
    // The directory lives under a freshly created temp dir, so it must be created by this call;
    // fail here instead of with a less obvious error when the log entries are copied.
    assertTrue(transactionLogDir.mkdirs());

    // Seed the table with an initial range of log entries and the last-checkpoint file
    File resourceDir = new File(getClass().getClassLoader().getResource("databricks/person/_delta_log").toURI());
    copyTransactionLogEntry(0, 12, resourceDir, transactionLogDir);
    Files.copy(new File(resourceDir, LAST_CHECKPOINT_FILENAME), new File(transactionLogDir, LAST_CHECKPOINT_FILENAME));

    // NOTE(review): tableSnapshot is not assigned in this method; presumably
    // setupTransactionLogAccess initializes it - confirm against the helper.
    setupTransactionLogAccess(tableName, new Path(tableDir.toURI()));
    List<AddFileEntry> expectedDataFiles = transactionLogAccess.getActiveFiles(tableSnapshot, SESSION);

    // Add further log entries after the first snapshot has been read
    copyTransactionLogEntry(12, 14, resourceDir, transactionLogDir);
    Set<String> newDataFiles = ImmutableSet.of(
            "age=28/part-00000-40dd1707-1d42-4328-a59a-21f5c945fe60.c000.snappy.parquet",
            "age=29/part-00000-3794c463-cb0c-4beb-8d07-7cc1e3b5920f.c000.snappy.parquet");

    TableSnapshot updatedTableSnapshot = transactionLogAccess.loadSnapshot(new SchemaTableName("schema", tableName), new Path(tableDir.toURI()), SESSION);
    List<AddFileEntry> allDataFiles = transactionLogAccess.getActiveFiles(updatedTableSnapshot, SESSION);
    List<AddFileEntry> dataFilesWithFixedVersion = transactionLogAccess.getActiveFiles(tableSnapshot, SESSION);

    // The new files must be visible through the new snapshot only
    for (String newFilePath : newDataFiles) {
        assertTrue(allDataFiles.stream().anyMatch(entry -> entry.getPath().equals(newFilePath)));
        assertTrue(dataFilesWithFixedVersion.stream().noneMatch(entry -> entry.getPath().equals(newFilePath)));
    }

    // The original snapshot must return exactly what it returned before the log grew
    assertEquals(expectedDataFiles.size(), dataFilesWithFixedVersion.size());
    List<ColumnMetadata> columns = extractSchema(transactionLogAccess.getMetadataEntry(tableSnapshot, SESSION).get(), TESTING_TYPE_MANAGER);
    for (int i = 0; i < expectedDataFiles.size(); i++) {
        AddFileEntry expected = expectedDataFiles.get(i);
        AddFileEntry actual = dataFilesWithFixedVersion.get(i);
        assertEquals(expected.getPath(), actual.getPath());
        assertEquals(expected.getPartitionValues(), actual.getPartitionValues());
        assertEquals(expected.getSize(), actual.getSize());
        assertEquals(expected.getModificationTime(), actual.getModificationTime());
        assertEquals(expected.isDataChange(), actual.isDataChange());
        assertEquals(expected.getTags(), actual.getTags());

        assertTrue(expected.getStats().isPresent());
        assertTrue(actual.getStats().isPresent());
        // The record count belongs to the file, not to a column: compare it once per file so the
        // check is not repeated for every column and still runs when the schema has no columns.
        assertEquals(expected.getStats().get().getNumRecords(), actual.getStats().get().getNumRecords());

        for (ColumnMetadata column : columns) {
            DeltaLakeColumnHandle columnHandle = new DeltaLakeColumnHandle(column.getName(), column.getType(), REGULAR);
            assertEquals(expected.getStats().get().getMinColumnValue(columnHandle), actual.getStats().get().getMinColumnValue(columnHandle));
            assertEquals(expected.getStats().get().getMaxColumnValue(columnHandle), actual.getStats().get().getMaxColumnValue(columnHandle));
            assertEquals(expected.getStats().get().getNullCount(columnHandle.getName()), actual.getStats().get().getNullCount(columnHandle.getName()));
        }
    }
}
Use of io.trino.plugin.deltalake.DeltaLakeColumnType.REGULAR in project trino by trinodb.
From the class HiveMetastoreBackedDeltaLakeMetastore, method getTableStatistics.
/**
 * Computes table statistics by aggregating the per-file statistics of the active files of the
 * table's current snapshot.
 *
 * <ul>
 * <li>Zero statistics are returned when the enforced partition constraint, the non-partition
 * constraint or the summary of {@code constraint} is "none", or when no records remain after
 * filtering files.</li>
 * <li>Empty statistics are returned as soon as an active file has no statistics, or a file that
 * passed the filters has no record count.</li>
 * <li>Files whose partition values do not match the enforced partition constraint, or whose
 * statistics predicate does not overlap the non-partition constraint, are skipped.</li>
 * </ul>
 *
 * @param session the session; also decides whether extended statistics are read
 * @param tableHandle the table whose statistics are computed
 * @param constraint additional constraint; only its summary is inspected
 * @return the aggregated statistics, zero statistics, or empty statistics as described above
 * @throws TrinoException if the transaction log contains no metadata entry for the table
 */
@Override
public TableStatistics getTableStatistics(ConnectorSession session, DeltaLakeTableHandle tableHandle, Constraint constraint)
{
    TableSnapshot tableSnapshot = getSnapshot(tableHandle.getSchemaTableName(), session);
    MetadataEntry metadata = transactionLogAccess.getMetadataEntry(tableSnapshot, session)
            .orElseThrow(() -> new TrinoException(DELTA_LAKE_INVALID_SCHEMA, "Metadata not found in transaction log for " + tableHandle.getTableName()));
    List<ColumnMetadata> columnMetadata = DeltaLakeSchemaSupport.extractSchema(metadata, typeManager);
    List<DeltaLakeColumnHandle> columns = columnMetadata.stream()
            .map(columnMeta -> new DeltaLakeColumnHandle(
                    columnMeta.getName(),
                    columnMeta.getType(),
                    metadata.getCanonicalPartitionColumns().contains(columnMeta.getName()) ? PARTITION_KEY : REGULAR))
            .collect(toImmutableList());

    // Decide on the "nothing can match" case before allocating any accumulators: it needs only the column list
    if (tableHandle.getEnforcedPartitionConstraint().isNone() || tableHandle.getNonPartitionConstraint().isNone() || constraint.getSummary().isNone()) {
        return createZeroStatistics(columns);
    }

    double numRecords = 0;

    // Every column starts with a null count of zero; min/max entries appear only once a file reports a value
    Map<DeltaLakeColumnHandle, Double> nullCounts = new HashMap<>();
    columns.forEach(column -> nullCounts.put(column, 0.0));
    Map<DeltaLakeColumnHandle, Double> minValues = new HashMap<>();
    Map<DeltaLakeColumnHandle, Double> maxValues = new HashMap<>();
    Map<DeltaLakeColumnHandle, Set<String>> partitioningColumnsDistinctValues = new HashMap<>();
    columns.stream()
            .filter(column -> column.getColumnType() == PARTITION_KEY)
            .forEach(column -> partitioningColumnsDistinctValues.put(column, new HashSet<>()));

    Set<String> predicatedColumnNames = tableHandle.getNonPartitionConstraint().getDomains().orElseThrow().keySet().stream()
            .map(DeltaLakeColumnHandle::getName)
            .collect(toImmutableSet());
    List<ColumnMetadata> predicatedColumns = columnMetadata.stream()
            .filter(column -> predicatedColumnNames.contains(column.getName()))
            .collect(toImmutableList());

    for (AddFileEntry addEntry : transactionLogAccess.getActiveFiles(tableSnapshot, session)) {
        Optional<? extends DeltaLakeFileStatistics> fileStatistics = addEntry.getStats();
        if (fileStatistics.isEmpty()) {
            // Open source Delta Lake does not collect stats
            return TableStatistics.empty();
        }
        DeltaLakeFileStatistics stats = fileStatistics.get();

        // Skip files excluded by the enforced partition constraint
        if (!partitionMatchesPredicate(addEntry.getCanonicalPartitionValues(), tableHandle.getEnforcedPartitionConstraint().getDomains().orElseThrow())) {
            continue;
        }

        // Skip files whose statistics cannot overlap the non-partition constraint
        TupleDomain<DeltaLakeColumnHandle> statisticsPredicate = createStatisticsPredicate(
                addEntry,
                predicatedColumns,
                tableHandle.getMetadataEntry().getCanonicalPartitionColumns());
        if (!tableHandle.getNonPartitionConstraint().overlaps(statisticsPredicate)) {
            continue;
        }

        if (stats.getNumRecords().isEmpty()) {
            // Not clear if it's possible for stats to be present with no row count, but bail out if that happens
            return TableStatistics.empty();
        }
        numRecords += stats.getNumRecords().get();

        for (DeltaLakeColumnHandle column : columns) {
            if (column.getColumnType() == PARTITION_KEY) {
                Optional<String> partitionValue = addEntry.getCanonicalPartitionValues().get(column.getName());
                if (partitionValue.isEmpty()) {
                    // A missing partition value means every record of the file is NULL in this column
                    nullCounts.merge(column, (double) stats.getNumRecords().get(), Double::sum);
                }
                else {
                    // NULL is not counted as a distinct value
                    // Code below assumes that values returned by addEntry.getCanonicalPartitionValues() are normalized,
                    // it may not be true in case of real, doubles, timestamps etc
                    partitioningColumnsDistinctValues.get(column).add(partitionValue.get());
                }
            }
            else {
                Optional<Long> maybeNullCount = stats.getNullCount(column.getName());
                if (maybeNullCount.isPresent()) {
                    // Adding to NaN keeps NaN, so a column already marked unknown stays unknown
                    nullCounts.merge(column, (double) maybeNullCount.get(), Double::sum);
                }
                else {
                    // If any individual file fails to report null counts, fail to calculate the total for the table
                    nullCounts.put(column, NaN);
                }
            }

            // Math.min returns NaN if any operand is NaN
            stats.getMinColumnValue(column)
                    .map(parsedValue -> toStatsRepresentation(column.getType(), parsedValue))
                    .filter(OptionalDouble::isPresent)
                    .map(OptionalDouble::getAsDouble)
                    .ifPresent(parsedValueAsDouble -> minValues.merge(column, parsedValueAsDouble, Math::min));
            stats.getMaxColumnValue(column)
                    .map(parsedValue -> toStatsRepresentation(column.getType(), parsedValue))
                    .filter(OptionalDouble::isPresent)
                    .map(OptionalDouble::getAsDouble)
                    .ifPresent(parsedValueAsDouble -> maxValues.merge(column, parsedValueAsDouble, Math::max));
        }
    }

    if (numRecords == 0) {
        return createZeroStatistics(columns);
    }

    TableStatistics.Builder statsBuilder = new TableStatistics.Builder().setRowCount(Estimate.of(numRecords));

    Optional<DeltaLakeStatistics> statistics = Optional.empty();
    if (isExtendedStatisticsEnabled(session)) {
        statistics = statisticsAccess.readDeltaLakeStatistics(session, tableHandle.getLocation());
    }

    for (DeltaLakeColumnHandle column : columns) {
        ColumnStatistics.Builder columnStatsBuilder = new ColumnStatistics.Builder();
        Double nullCount = nullCounts.get(column);
        columnStatsBuilder.setNullsFraction(nullCount.isNaN() ? Estimate.unknown() : Estimate.of(nullCount / numRecords));

        // Use whichever bounds are usable; a missing bound is replaced by the matching infinity
        Double maxValue = maxValues.get(column);
        Double minValue = minValues.get(column);
        if (isValidInRange(maxValue) && isValidInRange(minValue)) {
            columnStatsBuilder.setRange(new DoubleRange(minValue, maxValue));
        }
        else if (isValidInRange(maxValue)) {
            columnStatsBuilder.setRange(new DoubleRange(NEGATIVE_INFINITY, maxValue));
        }
        else if (isValidInRange(minValue)) {
            columnStatsBuilder.setRange(new DoubleRange(minValue, POSITIVE_INFINITY));
        }

        // extend statistics with NDV
        if (column.getColumnType() == PARTITION_KEY) {
            // Partition keys: number of distinct partition values seen in the files that passed the filters
            columnStatsBuilder.setDistinctValuesCount(Estimate.of(partitioningColumnsDistinctValues.get(column).size()));
        }
        if (statistics.isPresent()) {
            // Other columns: NDV comes from the extended statistics, when they have an entry for the column
            DeltaLakeColumnStatistics deltaLakeColumnStatistics = statistics.get().getColumnStatistics().get(column.getName());
            if (deltaLakeColumnStatistics != null && column.getColumnType() != PARTITION_KEY) {
                columnStatsBuilder.setDistinctValuesCount(Estimate.of(deltaLakeColumnStatistics.getNdvSummary().cardinality()));
            }
        }

        statsBuilder.setColumnStatistics(column, columnStatsBuilder.build());
    }
    return statsBuilder.build();
}
Use of io.trino.plugin.deltalake.DeltaLakeColumnType.REGULAR in project trino by trinodb.
From the class DeltaLakePageSourceProvider, method createPageSource.
/**
 * Creates the page source that reads one Delta Lake split.
 *
 * <p>The table's non-partition constraint, the split's statistics predicate and the current
 * dynamic filter predicate are intersected first; when the intersection is "none" an
 * {@link EmptyPageSource} is returned without opening the file. Otherwise a
 * {@code DeltaLakeUpdatablePageSource} is returned when the table handle carries a write type,
 * and a {@code DeltaLakePageSource} wrapping a Parquet reader for the {@code REGULAR} columns
 * in all other cases.
 */
@Override
public ConnectorPageSource createPageSource(
        ConnectorTransactionHandle transaction,
        ConnectorSession session,
        ConnectorSplit connectorSplit,
        ConnectorTableHandle connectorTable,
        List<ColumnHandle> columns,
        DynamicFilter dynamicFilter)
{
    DeltaLakeSplit split = (DeltaLakeSplit) connectorSplit;
    DeltaLakeTableHandle table = (DeltaLakeTableHandle) connectorTable;

    // The dynamic filter may have become more selective since the split was generated, so the
    // split's statistics predicate is re-checked here. If nothing can match, the split is pruned
    // without touching the Parquet file footer.
    TupleDomain<DeltaLakeColumnHandle> tablePredicate = table.getNonPartitionConstraint();
    TupleDomain<DeltaLakeColumnHandle> splitStatisticsPredicate = split.getStatisticsPredicate();
    TupleDomain<DeltaLakeColumnHandle> dynamicPredicate = dynamicFilter.getCurrentPredicate().transformKeys(DeltaLakeColumnHandle.class::cast);
    TupleDomain<DeltaLakeColumnHandle> filteredSplitPredicate = TupleDomain.intersect(ImmutableList.of(tablePredicate, splitStatisticsPredicate, dynamicPredicate));
    if (filteredSplitPredicate.isNone()) {
        return new EmptyPageSource();
    }

    List<DeltaLakeColumnHandle> deltaLakeColumns = columns.stream()
            .map(DeltaLakeColumnHandle.class::cast)
            .collect(toImmutableList());
    Map<String, Optional<String>> partitionKeys = split.getPartitionKeys();

    // Only REGULAR columns are requested from the Parquet reader
    List<HiveColumnHandle> parquetColumns = deltaLakeColumns.stream()
            .filter(column -> column.getColumnType() == REGULAR)
            .map(DeltaLakeColumnHandle::toHiveColumnHandle)
            .collect(toImmutableList());

    Path path = new Path(split.getPath());
    HdfsContext hdfsContext = new HdfsContext(session);
    TupleDomain<HiveColumnHandle> parquetPredicate = getParquetTupleDomain(filteredSplitPredicate.simplify(domainCompactionThreshold));

    if (table.getWriteType().isPresent()) {
        return new DeltaLakeUpdatablePageSource(
                table,
                deltaLakeColumns,
                partitionKeys,
                split.getPath(),
                split.getFileSize(),
                split.getFileModifiedTime(),
                session,
                executorService,
                hdfsEnvironment,
                hdfsContext,
                parquetDateTimeZone,
                parquetReaderOptions,
                parquetPredicate,
                typeManager,
                updateResultJsonCodec);
    }

    ReaderPageSource parquetReader = ParquetPageSourceFactory.createPageSource(
            path,
            split.getStart(),
            split.getLength(),
            split.getFileSize(),
            parquetColumns,
            parquetPredicate,
            true,
            hdfsEnvironment,
            hdfsEnvironment.getConfiguration(hdfsContext, path),
            session.getIdentity(),
            parquetDateTimeZone,
            fileFormatDataSourceStats,
            parquetReaderOptions
                    .withMaxReadBlockSize(getParquetMaxReadBlockSize(session))
                    .withUseColumnIndex(isParquetUseColumnIndex(session)));
    verify(parquetReader.getReaderColumns().isEmpty(), "All columns expected to be base columns");

    return new DeltaLakePageSource(
            deltaLakeColumns,
            partitionKeys,
            parquetReader.get(),
            split.getPath(),
            split.getFileSize(),
            split.getFileModifiedTime());
}
Use of io.trino.plugin.deltalake.DeltaLakeColumnType.REGULAR in project trino by trinodb.
From the class TestDeltaLakeCreateTableStatistics, method testTimestampMilliSingleRecord.
// int96 timestamp Statistics are only populated if a row group contains a single value
/**
 * Creates a table from two timestamp literals that both parse to 2012-10-31T08:00:00.123Z
 * plus one NULL, and checks the statistics of the single resulting file: three records,
 * one null, and identical min and max values for the timestamp column.
 */
@Test
public void testTimestampMilliSingleRecord()
        throws IOException
{
    String columnName = "t_timestamp";
    DeltaLakeColumnHandle columnHandle = new DeltaLakeColumnHandle(columnName, TIMESTAMP_TZ_MILLIS, REGULAR);

    // Converts an ISO-8601 timestamp string into the packed millis-with-UTC-zone representation
    Function<String, Long> toPackedUtcMillis = isoTimestamp -> {
        Instant parsed = ZonedDateTime.parse(isoTimestamp).toInstant();
        return packDateTimeWithZone(parsed.toEpochMilli(), UTC_KEY);
    };

    try (TestTable table = new TestTable(
            "test_timestamp_single_record_",
            ImmutableList.of(columnName),
            "VALUES timestamp '2012-10-31 04:00:00.123 America/New_York', timestamp '2012-10-31 01:00:00.123 America/Los_Angeles', null")) {
        List<AddFileEntry> addFileEntries = getAddFileEntries(table.getName());
        AddFileEntry onlyEntry = getOnlyElement(addFileEntries);
        assertThat(onlyEntry.getStats()).isPresent();

        DeltaLakeFileStatistics fileStats = onlyEntry.getStats().get();
        assertEquals(fileStats.getNumRecords(), Optional.of(3L));

        // Min and max are expected to be the same single value
        assertEquals(fileStats.getMinColumnValue(columnHandle), Optional.of(toPackedUtcMillis.apply("2012-10-31T08:00:00.123Z")));
        assertEquals(fileStats.getMaxColumnValue(columnHandle), Optional.of(toPackedUtcMillis.apply("2012-10-31T08:00:00.123Z")));
        assertEquals(fileStats.getNullCount(columnName), Optional.of(1L));
    }
}
Aggregations