Search in sources :

Example 1 with Metrics

use of org.apache.iceberg.Metrics in project presto by prestodb.

In class TestMetricsWrapper, method testRoundTrip.

/**
 * Verifies that a {@link Metrics} instance wrapped in a {@code MetricsWrapper} keeps its
 * record count, per-column counts, and lower/upper bounds after being serialized to JSON
 * and deserialized again through {@code CODEC}.
 */
@Test
public void testRoundTrip() {
    // Fixture values; every map uses distinct keys so a mix-up between fields would be detected.
    Long expectedRecordCount = 123L;
    Map<Integer, Long> expectedColumnSizes = ImmutableMap.of(3, 321L, 5, 543L);
    Map<Integer, Long> expectedValueCounts = ImmutableMap.of(7, 765L, 9, 987L);
    Map<Integer, Long> expectedNullValueCounts = ImmutableMap.of(2, 234L, 4, 456L);
    ByteBuffer lowerBoundBytes = ByteBuffer.wrap(new byte[] { 0, 8, 9 });
    ByteBuffer upperBoundBytes = ByteBuffer.wrap(new byte[] { 5, 4, 0 });
    Map<Integer, ByteBuffer> expectedLowerBounds = ImmutableMap.of(13, lowerBoundBytes);
    Map<Integer, ByteBuffer> expectedUpperBounds = ImmutableMap.of(17, upperBoundBytes);

    Metrics original = new Metrics(
            expectedRecordCount,
            expectedColumnSizes,
            expectedValueCounts,
            expectedNullValueCounts,
            expectedLowerBounds,
            expectedUpperBounds);

    // Serialize the wrapped metrics to JSON and read them straight back.
    MetricsWrapper wrapped = new MetricsWrapper(original);
    Metrics roundTripped = CODEC.fromJson(CODEC.toJson(wrapped)).metrics();

    // TestNG argument order: (actual, expected).
    assertEquals(roundTripped.recordCount(), expectedRecordCount);
    assertEquals(roundTripped.columnSizes(), expectedColumnSizes);
    assertEquals(roundTripped.valueCounts(), expectedValueCounts);
    assertEquals(roundTripped.nullValueCounts(), expectedNullValueCounts);
    assertEquals(roundTripped.lowerBounds(), expectedLowerBounds);
    assertEquals(roundTripped.upperBounds(), expectedUpperBounds);
}
Also used : Metrics(org.apache.iceberg.Metrics) ByteBuffer(java.nio.ByteBuffer) Test(org.testng.annotations.Test)

Example 2 with Metrics

use of org.apache.iceberg.Metrics in project presto by prestodb.

In class IcebergOrcFileWriter, method computeMetrics.

/**
 * Builds the Iceberg {@link Metrics} for a written ORC file from its per-column statistics.
 *
 * @param icebergSchema schema used to resolve each ORC column's Iceberg field
 * @param orcRowTypes ORC types indexed by ORC column id; index 0 is the root column
 * @param fileRowCount number of rows in the file; reported as the record count and as the
 *        value count of every included column
 * @param columnStatistics ORC column statistics, indexed the same way as {@code orcRowTypes}
 * @return metrics carrying the row count plus value counts, null counts and lower/upper
 *         bounds; any of those maps that ends up empty is passed as {@code null}, and
 *         column sizes are always {@code null}
 */
private static Metrics computeMetrics(Schema icebergSchema, List<OrcType> orcRowTypes, long fileRowCount, List<ColumnStatistics> columnStatistics) {
    // With no column statistics at all, only the row count can be reported.
    if (columnStatistics.isEmpty()) {
        return new Metrics(fileRowCount, null, null, null, null, null);
    }

    // Columns that are descendants of LIST or MAP types are excluded because:
    // 1. Their stats are not used by Apache Iceberg to filter out data files
    // 2. Their record count can be larger than table-level row count. There's no good way to calculate nullCounts for them.
    // See https://github.com/apache/iceberg/pull/199#discussion_r429443627
    Set<Integer> skippedColumns = getExcludedColumns(orcRowTypes);

    ImmutableMap.Builder<Integer, Long> valueCountsById = ImmutableMap.builder();
    ImmutableMap.Builder<Integer, Long> nullCountsById = ImmutableMap.builder();
    ImmutableMap.Builder<Integer, ByteBuffer> lowerBoundsById = ImmutableMap.builder();
    ImmutableMap.Builder<Integer, ByteBuffer> upperBoundsById = ImmutableMap.builder();

    // OrcColumnId(0) is the root column that represents file-level schema, so start at 1.
    for (int columnIndex = 1; columnIndex < orcRowTypes.size(); columnIndex++) {
        if (!skippedColumns.contains(columnIndex)) {
            OrcType columnType = orcRowTypes.get(columnIndex);
            ColumnStatistics stats = columnStatistics.get(columnIndex);

            int fieldId = getIcebergId(columnType);
            NestedField field = icebergSchema.findField(fieldId);
            verify(field != null, "Cannot find Iceberg column with ID %s in schema %s", fieldId, icebergSchema);

            valueCountsById.put(fieldId, fileRowCount);

            // The null count is derived as the file row count minus the number of values ORC recorded.
            if (stats.hasNumberOfValues()) {
                nullCountsById.put(fieldId, fileRowCount - stats.getNumberOfValues());
            }

            // Bounds are recorded only when a min/max pair could be produced for this column.
            toIcebergMinMax(stats, field.type()).ifPresent(range -> {
                lowerBoundsById.put(fieldId, range.getMin());
                upperBoundsById.put(fieldId, range.getMax());
            });
        }
    }

    // Empty maps are handed to Metrics as null rather than as empty maps.
    Map<Integer, Long> valueCounts = valueCountsById.build();
    if (valueCounts.isEmpty()) {
        valueCounts = null;
    }
    Map<Integer, Long> nullCounts = nullCountsById.build();
    if (nullCounts.isEmpty()) {
        nullCounts = null;
    }
    Map<Integer, ByteBuffer> lowerBounds = lowerBoundsById.build();
    if (lowerBounds.isEmpty()) {
        lowerBounds = null;
    }
    Map<Integer, ByteBuffer> upperBounds = upperBoundsById.build();
    if (upperBounds.isEmpty()) {
        upperBounds = null;
    }

    // TODO: Add column size accounting to ORC column writers
    Map<Integer, Long> columnSizes = null;
    return new Metrics(fileRowCount, columnSizes, valueCounts, nullCounts, lowerBounds, upperBounds);
}
Also used : ColumnStatistics(com.facebook.presto.orc.metadata.statistics.ColumnStatistics) ByteBuffer(java.nio.ByteBuffer) ImmutableMap(com.google.common.collect.ImmutableMap) NestedField(org.apache.iceberg.types.Types.NestedField) Metrics(org.apache.iceberg.Metrics) OrcType(com.facebook.presto.orc.metadata.OrcType)

Aggregations

ByteBuffer (java.nio.ByteBuffer): 2, Metrics (org.apache.iceberg.Metrics): 2, OrcType (com.facebook.presto.orc.metadata.OrcType): 1, ColumnStatistics (com.facebook.presto.orc.metadata.statistics.ColumnStatistics): 1, ImmutableMap (com.google.common.collect.ImmutableMap): 1, NestedField (org.apache.iceberg.types.Types.NestedField): 1, Test (org.testng.annotations.Test): 1