Use of io.trino.orc.metadata.statistics.ColumnStatistics in project trino by trinodb.
The class TupleDomainOrcPredicate, method getDomain.
@VisibleForTesting
public static Domain getDomain(Type type, long rowCount, ColumnStatistics columnStatistics) {
    if (rowCount == 0) {
        return Domain.none(type);
    }
    if (columnStatistics == null) {
        return Domain.all(type);
    }
    if (columnStatistics.hasNumberOfValues() && columnStatistics.getNumberOfValues() == 0) {
        return Domain.onlyNull(type);
    }
    boolean hasNullValue = columnStatistics.getNumberOfValues() != rowCount;
    if (type instanceof TimeType && columnStatistics.getIntegerStatistics() != null) {
        // This is the representation of TIME used by Iceberg
        return createDomain(type, hasNullValue, columnStatistics.getIntegerStatistics(), value -> ((long) value) * Timestamps.PICOSECONDS_PER_MICROSECOND);
    }
    if (type.getJavaType() == boolean.class && columnStatistics.getBooleanStatistics() != null) {
        BooleanStatistics booleanStatistics = columnStatistics.getBooleanStatistics();
        boolean hasTrueValues = (booleanStatistics.getTrueValueCount() != 0);
        boolean hasFalseValues = (columnStatistics.getNumberOfValues() != booleanStatistics.getTrueValueCount());
        if (hasTrueValues && hasFalseValues) {
            return Domain.all(BOOLEAN);
        }
        if (hasTrueValues) {
            return Domain.create(ValueSet.of(BOOLEAN, true), hasNullValue);
        }
        if (hasFalseValues) {
            return Domain.create(ValueSet.of(BOOLEAN, false), hasNullValue);
        }
    }
    else if (isShortDecimal(type) && columnStatistics.getDecimalStatistics() != null) {
        return createDomain(type, hasNullValue, columnStatistics.getDecimalStatistics(), value -> rescale(value, (DecimalType) type).unscaledValue().longValue());
    }
    else if (isLongDecimal(type) && columnStatistics.getDecimalStatistics() != null) {
        return createDomain(type, hasNullValue, columnStatistics.getDecimalStatistics(), value -> Int128.valueOf(rescale(value, (DecimalType) type).unscaledValue()));
    }
    else if (type instanceof CharType && columnStatistics.getStringStatistics() != null) {
        return createDomain(type, hasNullValue, columnStatistics.getStringStatistics(), value -> truncateToLengthAndTrimSpaces(value, type));
    }
    else if (type instanceof VarcharType && columnStatistics.getStringStatistics() != null) {
        return createDomain(type, hasNullValue, columnStatistics.getStringStatistics());
    }
    else if (type instanceof DateType && columnStatistics.getDateStatistics() != null) {
        return createDomain(type, hasNullValue, columnStatistics.getDateStatistics(), value -> (long) value);
    }
    else if ((type.equals(TIMESTAMP_MILLIS) || type.equals(TIMESTAMP_MICROS)) && columnStatistics.getTimestampStatistics() != null) {
        // ORC timestamp statistics are kept at millisecond precision, so the upper bound of the domain we create
        // must be adjusted accordingly, to include timestamps that were rounded down to the statistics' maximum.
        return createDomain(type, hasNullValue, columnStatistics.getTimestampStatistics(), min -> min * MICROSECONDS_PER_MILLISECOND, max -> (max + 1) * MICROSECONDS_PER_MILLISECOND);
    }
    else if (type.equals(TIMESTAMP_NANOS) && columnStatistics.getTimestampStatistics() != null) {
        return createDomain(type, hasNullValue, columnStatistics.getTimestampStatistics(), min -> new LongTimestamp(min * MICROSECONDS_PER_MILLISECOND, 0), max -> new LongTimestamp((max + 1) * MICROSECONDS_PER_MILLISECOND, 0));
    }
    else if (type.equals(TIMESTAMP_TZ_MILLIS) && columnStatistics.getTimestampStatistics() != null) {
        return createDomain(type, hasNullValue, columnStatistics.getTimestampStatistics(), value -> packDateTimeWithZone(value, UTC_KEY));
    }
    else if (type.equals(TIMESTAMP_TZ_MICROS) && columnStatistics.getTimestampStatistics() != null) {
        return createDomain(type, hasNullValue, columnStatistics.getTimestampStatistics(), min -> LongTimestampWithTimeZone.fromEpochMillisAndFraction(min, 0, UTC_KEY), max -> LongTimestampWithTimeZone.fromEpochMillisAndFraction(max, 999_000_000, UTC_KEY));
    }
    else if (type.equals(TIMESTAMP_TZ_NANOS) && columnStatistics.getTimestampStatistics() != null) {
        return createDomain(type, hasNullValue, columnStatistics.getTimestampStatistics(), min -> LongTimestampWithTimeZone.fromEpochMillisAndFraction(min, 0, UTC_KEY), max -> LongTimestampWithTimeZone.fromEpochMillisAndFraction(max, 999_999_000, UTC_KEY));
    }
    else if (type.getJavaType() == long.class && columnStatistics.getIntegerStatistics() != null) {
        return createDomain(type, hasNullValue, columnStatistics.getIntegerStatistics());
    }
    else if (type.getJavaType() == double.class && columnStatistics.getDoubleStatistics() != null) {
        return createDomain(type, hasNullValue, columnStatistics.getDoubleStatistics());
    }
    else if (REAL.equals(type) && columnStatistics.getDoubleStatistics() != null) {
        return createDomain(type, hasNullValue, columnStatistics.getDoubleStatistics(), value -> (long) floatToRawIntBits(value.floatValue()));
    }
    return Domain.create(ValueSet.all(type), hasNullValue);
}
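
getDomain degrades gracefully when statistics are missing or trivial, which is what makes it safe to call for every column of every stripe and row group. Below is a minimal test-style sketch of the two early-exit branches; it assumes test-scope access to trino-orc and trino-spi, and the class name GetDomainSketch is illustrative, not part of Trino.

import io.trino.spi.predicate.Domain;

import static io.trino.orc.TupleDomainOrcPredicate.getDomain;
import static io.trino.spi.type.BigintType.BIGINT;

final class GetDomainSketch {
    static void demonstrate() {
        // Zero rows: nothing can match, regardless of statistics
        Domain none = getDomain(BIGINT, 0, null);
        assert none.equals(Domain.none(BIGINT));

        // Rows present but no statistics: no pruning information, so everything may match
        Domain all = getDomain(BIGINT, 100, null);
        assert all.equals(Domain.all(BIGINT));
    }
}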
Use of io.trino.orc.metadata.statistics.ColumnStatistics in project trino by trinodb.
The class IcebergOrcFileWriter, method computeMetrics.
private static Metrics computeMetrics(MetricsConfig metricsConfig, Schema icebergSchema, ColumnMetadata<OrcType> orcColumns, long fileRowCount, Optional<ColumnMetadata<ColumnStatistics>> columnStatistics) {
    if (columnStatistics.isEmpty()) {
        return new Metrics(fileRowCount, null, null, null, null, null, null);
    }
    // Columns that are descendants of LIST or MAP types are excluded because:
    // 1. Their stats are not used by Apache Iceberg to filter out data files
    // 2. Their record count can be larger than the table-level row count, and there is no good way to calculate nullCounts for them
    // See https://github.com/apache/iceberg/pull/199#discussion_r429443627
    Set<OrcColumnId> excludedColumns = getExcludedColumns(orcColumns);
    ImmutableMap.Builder<Integer, Long> valueCountsBuilder = ImmutableMap.builder();
    ImmutableMap.Builder<Integer, Long> nullCountsBuilder = ImmutableMap.builder();
    ImmutableMap.Builder<Integer, ByteBuffer> lowerBoundsBuilder = ImmutableMap.builder();
    ImmutableMap.Builder<Integer, ByteBuffer> upperBoundsBuilder = ImmutableMap.builder();
    // OrcColumnId(0) is the root column that represents the file-level schema
    for (int i = 1; i < orcColumns.size(); i++) {
        OrcColumnId orcColumnId = new OrcColumnId(i);
        if (excludedColumns.contains(orcColumnId)) {
            continue;
        }
        OrcType orcColumn = orcColumns.get(orcColumnId);
        ColumnStatistics orcColumnStats = columnStatistics.get().get(orcColumnId);
        int icebergId = getIcebergId(orcColumn);
        Types.NestedField icebergField = icebergSchema.findField(icebergId);
        MetricsModes.MetricsMode metricsMode = MetricsUtil.metricsMode(icebergSchema, metricsConfig, icebergId);
        if (metricsMode.equals(MetricsModes.None.get())) {
            continue;
        }
        verify(icebergField != null, "Cannot find Iceberg column with ID %s in schema %s", icebergId, icebergSchema);
        valueCountsBuilder.put(icebergId, fileRowCount);
        if (orcColumnStats.hasNumberOfValues()) {
            nullCountsBuilder.put(icebergId, fileRowCount - orcColumnStats.getNumberOfValues());
        }
        if (!metricsMode.equals(MetricsModes.Counts.get())) {
            toIcebergMinMax(orcColumnStats, icebergField.type(), metricsMode).ifPresent(minMax -> {
                lowerBoundsBuilder.put(icebergId, minMax.getMin());
                upperBoundsBuilder.put(icebergId, minMax.getMax());
            });
        }
    }
    Map<Integer, Long> valueCounts = valueCountsBuilder.buildOrThrow();
    Map<Integer, Long> nullCounts = nullCountsBuilder.buildOrThrow();
    Map<Integer, ByteBuffer> lowerBounds = lowerBoundsBuilder.buildOrThrow();
    Map<Integer, ByteBuffer> upperBounds = upperBoundsBuilder.buildOrThrow();
    return new Metrics(
            fileRowCount,
            null, // TODO: Add column size accounting to ORC column writers
            valueCounts.isEmpty() ? null : valueCounts,
            nullCounts.isEmpty() ? null : nullCounts,
            null, // TODO: Add nanValueCounts to ORC writer
            lowerBounds.isEmpty() ? null : lowerBounds,
            upperBounds.isEmpty() ? null : upperBounds);
}
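
Every map in the Metrics object built here is keyed by Iceberg field ID, not by ORC column ordinal. The sketch below shows, under that assumption, how such a Metrics instance returned for a freshly written data file could be read back; the helper class and its name are illustrative only.

import java.nio.ByteBuffer;
import java.util.Map;

import org.apache.iceberg.Metrics;

final class MetricsSketch {
    static void describe(Metrics metrics) {
        System.out.printf("rows: %d%n", metrics.recordCount());
        // Maps may be null when the corresponding statistics were not collected
        Map<Integer, Long> nullCounts = metrics.nullValueCounts();
        if (nullCounts != null) {
            nullCounts.forEach((fieldId, nulls) -> System.out.printf("field %d: %d nulls%n", fieldId, nulls));
        }
        Map<Integer, ByteBuffer> lowerBounds = metrics.lowerBounds();
        if (lowerBounds != null) {
            // Bounds are serialized with Iceberg's single-value binary representation
            lowerBounds.forEach((fieldId, bound) -> System.out.printf("field %d: %d-byte lower bound%n", fieldId, bound.remaining()));
        }
    }
}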
Use of io.trino.orc.metadata.statistics.ColumnStatistics in project trino by trinodb.
The class DecimalColumnWriter, method getIndexStreams.
@Override
public List<StreamDataOutput> getIndexStreams(CompressedMetadataWriter metadataWriter) throws IOException {
    checkState(closed);
    ImmutableList.Builder<RowGroupIndex> rowGroupIndexes = ImmutableList.builder();
    List<DecimalStreamCheckpoint> dataCheckpoints = dataStream.getCheckpoints();
    List<LongStreamCheckpoint> scaleCheckpoints = scaleStream.getCheckpoints();
    Optional<List<BooleanStreamCheckpoint>> presentCheckpoints = presentStream.getCheckpoints();
    for (int i = 0; i < rowGroupColumnStatistics.size(); i++) {
        int groupId = i;
        ColumnStatistics columnStatistics = rowGroupColumnStatistics.get(groupId);
        DecimalStreamCheckpoint dataCheckpoint = dataCheckpoints.get(groupId);
        LongStreamCheckpoint scaleCheckpoint = scaleCheckpoints.get(groupId);
        Optional<BooleanStreamCheckpoint> presentCheckpoint = presentCheckpoints.map(checkpoints -> checkpoints.get(groupId));
        List<Integer> positions = createDecimalColumnPositionList(compressed, dataCheckpoint, scaleCheckpoint, presentCheckpoint);
        rowGroupIndexes.add(new RowGroupIndex(positions, columnStatistics));
    }
    Slice slice = metadataWriter.writeRowIndexes(rowGroupIndexes.build());
    Stream stream = new Stream(columnId, StreamKind.ROW_INDEX, slice.length(), false);
    return ImmutableList.of(new StreamDataOutput(slice, stream));
}
Use of io.trino.orc.metadata.statistics.ColumnStatistics in project trino by trinodb.
The class DoubleColumnWriter, method getIndexStreams.
@Override
public List<StreamDataOutput> getIndexStreams(CompressedMetadataWriter metadataWriter) throws IOException {
    checkState(closed);
    ImmutableList.Builder<RowGroupIndex> rowGroupIndexes = ImmutableList.builder();
    List<DoubleStreamCheckpoint> dataCheckpoints = dataStream.getCheckpoints();
    Optional<List<BooleanStreamCheckpoint>> presentCheckpoints = presentStream.getCheckpoints();
    for (int i = 0; i < rowGroupColumnStatistics.size(); i++) {
        int groupId = i;
        ColumnStatistics columnStatistics = rowGroupColumnStatistics.get(groupId);
        DoubleStreamCheckpoint dataCheckpoint = dataCheckpoints.get(groupId);
        Optional<BooleanStreamCheckpoint> presentCheckpoint = presentCheckpoints.map(checkpoints -> checkpoints.get(groupId));
        List<Integer> positions = createDoubleColumnPositionList(compressed, dataCheckpoint, presentCheckpoint);
        rowGroupIndexes.add(new RowGroupIndex(positions, columnStatistics));
    }
    Slice slice = metadataWriter.writeRowIndexes(rowGroupIndexes.build());
    Stream stream = new Stream(columnId, StreamKind.ROW_INDEX, slice.length(), false);
    return ImmutableList.of(new StreamDataOutput(slice, stream));
}
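
The two getIndexStreams implementations above differ only in which data-stream checkpoints they collect; the surrounding shape is identical. Below is a condensed sketch of that shared shape. It is not part of Trino: the helper name, the positionsPerGroup parameter (standing in for the per-writer createXxxColumnPositionList call), and the import locations are all assumptions based on the writer classes shown above.

import com.google.common.collect.ImmutableList;
import io.airlift.slice.Slice;
import io.trino.orc.metadata.CompressedMetadataWriter;
import io.trino.orc.metadata.OrcColumnId;
import io.trino.orc.metadata.RowGroupIndex;
import io.trino.orc.metadata.Stream;
import io.trino.orc.metadata.Stream.StreamKind;
import io.trino.orc.metadata.statistics.ColumnStatistics;
import io.trino.orc.stream.StreamDataOutput;

import java.io.IOException;
import java.util.List;

// Illustrative helper: assumes the caller has already turned each row group's
// stream checkpoints into a position list.
final class RowIndexSketch {
    static StreamDataOutput buildRowIndexStream(
            CompressedMetadataWriter metadataWriter,
            OrcColumnId columnId,
            List<List<Integer>> positionsPerGroup,
            List<ColumnStatistics> statisticsPerGroup)
            throws IOException {
        ImmutableList.Builder<RowGroupIndex> rowGroupIndexes = ImmutableList.builder();
        for (int groupId = 0; groupId < statisticsPerGroup.size(); groupId++) {
            // One RowGroupIndex per row group: seek positions plus that group's statistics
            rowGroupIndexes.add(new RowGroupIndex(positionsPerGroup.get(groupId), statisticsPerGroup.get(groupId)));
        }
        // The whole index is serialized into a single ROW_INDEX stream for the column
        Slice slice = metadataWriter.writeRowIndexes(rowGroupIndexes.build());
        Stream stream = new Stream(columnId, StreamKind.ROW_INDEX, slice.length(), false);
        return new StreamDataOutput(slice, stream);
    }
}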
Use of io.trino.orc.metadata.statistics.ColumnStatistics in project trino by trinodb.
The class DoubleColumnWriter, method finishRowGroup.
@Override
public Map<OrcColumnId, ColumnStatistics> finishRowGroup() {
    checkState(!closed);
    ColumnStatistics statistics = statisticsBuilder.buildColumnStatistics();
    rowGroupColumnStatistics.add(statistics);
    statisticsBuilder = statisticsBuilderSupplier.get();
    return ImmutableMap.of(columnId, statistics);
}