Use of org.apache.iceberg.Metrics in project presto by prestodb.
From the class TestMetricsWrapper, the method testRoundTrip:
@Test
public void testRoundTrip() {
    Long recordCount = 123L;
    Map<Integer, Long> columnSizes = ImmutableMap.of(3, 321L, 5, 543L);
    Map<Integer, Long> valueCounts = ImmutableMap.of(7, 765L, 9, 987L);
    Map<Integer, Long> nullValueCounts = ImmutableMap.of(2, 234L, 4, 456L);
    Map<Integer, ByteBuffer> lowerBounds = ImmutableMap.of(13, ByteBuffer.wrap(new byte[] {0, 8, 9}));
    Map<Integer, ByteBuffer> upperBounds = ImmutableMap.of(17, ByteBuffer.wrap(new byte[] {5, 4, 0}));

    Metrics expected = new Metrics(recordCount, columnSizes, valueCounts, nullValueCounts, lowerBounds, upperBounds);
    Metrics actual = CODEC.fromJson(CODEC.toJson(new MetricsWrapper(expected))).metrics();

    assertEquals(actual.recordCount(), recordCount);
    assertEquals(actual.columnSizes(), columnSizes);
    assertEquals(actual.valueCounts(), valueCounts);
    assertEquals(actual.nullValueCounts(), nullValueCounts);
    assertEquals(actual.lowerBounds(), lowerBounds);
    assertEquals(actual.upperBounds(), upperBounds);
}
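The CODEC referenced above is not shown in the snippet. In Presto it is an Airlift JSON codec bound to MetricsWrapper; the wrapper exists because Iceberg's Metrics class carries no JSON bindings of its own. A minimal sketch of the assumed setup (the field declaration is an assumption, not the verbatim test source):

import io.airlift.json.JsonCodec;

// Assumed codec setup: Airlift's JsonCodec serializes MetricsWrapper to JSON
// and back, which is what makes the round trip in the test above possible.
private static final JsonCodec<MetricsWrapper> CODEC = JsonCodec.jsonCodec(MetricsWrapper.class);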
From the class IcebergOrcFileWriter, the method computeMetrics:
private static Metrics computeMetrics(Schema icebergSchema, List<OrcType> orcRowTypes, long fileRowCount, List<ColumnStatistics> columnStatistics) {
    if (columnStatistics.isEmpty()) {
        return new Metrics(fileRowCount, null, null, null, null, null);
    }
    // Columns that are descendants of LIST or MAP types are excluded because:
    // 1. Their stats are not used by Apache Iceberg to filter out data files
    // 2. Their record count can be larger than the table-level row count, and there is no good way to calculate nullCounts for them
    // See https://github.com/apache/iceberg/pull/199#discussion_r429443627
    Set<Integer> excludedColumns = getExcludedColumns(orcRowTypes);

    ImmutableMap.Builder<Integer, Long> valueCountsBuilder = ImmutableMap.builder();
    ImmutableMap.Builder<Integer, Long> nullCountsBuilder = ImmutableMap.builder();
    ImmutableMap.Builder<Integer, ByteBuffer> lowerBoundsBuilder = ImmutableMap.builder();
    ImmutableMap.Builder<Integer, ByteBuffer> upperBoundsBuilder = ImmutableMap.builder();

    // OrcColumnId(0) is the root column that represents the file-level schema
    for (int i = 1; i < orcRowTypes.size(); i++) {
        if (excludedColumns.contains(i)) {
            continue;
        }
        OrcType orcColumn = orcRowTypes.get(i);
        ColumnStatistics orcColumnStats = columnStatistics.get(i);
        int icebergId = getIcebergId(orcColumn);
        NestedField icebergField = icebergSchema.findField(icebergId);
        verify(icebergField != null, "Cannot find Iceberg column with ID %s in schema %s", icebergId, icebergSchema);
        valueCountsBuilder.put(icebergId, fileRowCount);
        if (orcColumnStats.hasNumberOfValues()) {
            nullCountsBuilder.put(icebergId, fileRowCount - orcColumnStats.getNumberOfValues());
        }
        toIcebergMinMax(orcColumnStats, icebergField.type()).ifPresent(minMax -> {
            lowerBoundsBuilder.put(icebergId, minMax.getMin());
            upperBoundsBuilder.put(icebergId, minMax.getMax());
        });
    }

    Map<Integer, Long> valueCounts = valueCountsBuilder.build();
    Map<Integer, Long> nullCounts = nullCountsBuilder.build();
    Map<Integer, ByteBuffer> lowerBounds = lowerBoundsBuilder.build();
    Map<Integer, ByteBuffer> upperBounds = upperBoundsBuilder.build();
    return new Metrics(
            fileRowCount,
            null, // TODO: Add column size accounting to ORC column writers
            valueCounts.isEmpty() ? null : valueCounts,
            nullCounts.isEmpty() ? null : nullCounts,
            lowerBounds.isEmpty() ? null : lowerBounds,
            upperBounds.isEmpty() ? null : upperBounds);
}
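The helpers getExcludedColumns and getIcebergId are referenced but not shown. As an illustration only (the recursive traversal and the populateExcluded helper below are assumptions inferred from the comment about LIST and MAP descendants, not the verbatim Presto implementation), the exclusion set can be built by walking the flattened ORC type list from the root and marking every column reachable through a LIST or MAP node:

import com.google.common.collect.ImmutableSet;

// Hypothetical sketch: walk the flattened ORC type tree from the root (index 0)
// and collect the indexes of all columns nested under a LIST or MAP column.
// Only descendants are excluded; the LIST or MAP column itself is kept.
private static Set<Integer> getExcludedColumns(List<OrcType> orcRowTypes) {
    ImmutableSet.Builder<Integer> excluded = ImmutableSet.builder();
    populateExcluded(orcRowTypes, excluded, 0, false);
    return excluded.build();
}

private static void populateExcluded(List<OrcType> orcRowTypes, ImmutableSet.Builder<Integer> excluded, int index, boolean isExcluded) {
    if (isExcluded) {
        excluded.add(index);
    }
    OrcType type = orcRowTypes.get(index);
    // Once a LIST or MAP is seen, every descendant gets excluded from metrics.
    boolean excludeChildren = isExcluded
            || type.getOrcTypeKind() == OrcType.OrcTypeKind.LIST
            || type.getOrcTypeKind() == OrcType.OrcTypeKind.MAP;
    for (int i = 0; i < type.getFieldCount(); i++) {
        populateExcluded(orcRowTypes, excluded, type.getFieldTypeIndex(i), excludeChildren);
    }
}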