Search in sources :

Example 56 with ColumnStatistics

use of com.facebook.presto.orc.metadata.statistics.ColumnStatistics in project urban-eureka by errir503.

the class AggregatedOrcPageSource method writeMinMax.

/**
 * Appends the file-level minimum or maximum of one column to {@code blockBuilder}, using the
 * statistics stored in the ORC footer instead of reading row data.
 *
 * <p>If the footer holds no usable statistic for the column (either the typed statistics object
 * or the requested bound is missing), an {@link UnsupportedOperationException} is thrown whose
 * message explains how to disable the aggregation pushdown.
 *
 * @param columnIndex zero-based column position; the footer lists are read at {@code columnIndex + 1}
 * @param type engine type; used to write DOUBLE and long-decimal values and to account fixed-width bytes
 * @param hiveType declared Hive type, consulted only to choose between short and long decimal encoding
 * @param blockBuilder destination for the value
 * @param isMin {@code true} to write the minimum, {@code false} to write the maximum
 * @throws UnsupportedOperationException if no min/max is available for the column
 * @throws IllegalArgumentException if the ORC type kind is not handled here
 */
private void writeMinMax(int columnIndex, Type type, HiveType hiveType, BlockBuilder blockBuilder, boolean isMin) {
    // The footer lists are read one slot past the requested column
    // (presumably slot 0 is the ORC root type -- confirm against the footer layout).
    ColumnStatistics columnStatistics = footer.getFileStats().get(columnIndex + 1);
    OrcType orcType = footer.getTypes().get(columnIndex + 1);
    if (type instanceof FixedWidthType) {
        completedBytes += ((FixedWidthType) type).getFixedSize();
    }
    String orcNoMinMaxMessage = "No min/max found for orc file. Set session property hive.pushdown_partial_aggregations_into_scan=false and execute query again";
    switch (orcType.getOrcTypeKind()) {
        case SHORT:
        case INT:
        case LONG: {
            // Guard the statistics object itself so a column without integer statistics
            // reports the actionable message rather than a NullPointerException.
            requireMinMaxPresent(columnStatistics.getIntegerStatistics(), orcNoMinMaxMessage);
            Long value = isMin ? columnStatistics.getIntegerStatistics().getMin() : columnStatistics.getIntegerStatistics().getMax();
            requireMinMaxPresent(value, orcNoMinMaxMessage);
            blockBuilder.writeLong(value);
            break;
        }
        case TIMESTAMP:
        case DATE: {
            // TIMESTAMP shares the date-statistics path; if a file carries no date statistics
            // for such a column the guard below turns that into the "no min/max" error.
            requireMinMaxPresent(columnStatistics.getDateStatistics(), orcNoMinMaxMessage);
            Integer value = isMin ? columnStatistics.getDateStatistics().getMin() : columnStatistics.getDateStatistics().getMax();
            requireMinMaxPresent(value, orcNoMinMaxMessage);
            blockBuilder.writeLong(value.longValue());
            break;
        }
        case VARCHAR:
        case CHAR:
        case STRING: {
            requireMinMaxPresent(columnStatistics.getStringStatistics(), orcNoMinMaxMessage);
            Slice value = isMin ? columnStatistics.getStringStatistics().getMin() : columnStatistics.getStringStatistics().getMax();
            requireMinMaxPresent(value, orcNoMinMaxMessage);
            blockBuilder.writeBytes(value, 0, value.length()).closeEntry();
            // Variable-width values are accounted here; fixed-width ones were accounted above.
            completedBytes += value.length();
            break;
        }
        case FLOAT: {
            requireMinMaxPresent(columnStatistics.getDoubleStatistics(), orcNoMinMaxMessage);
            Double value = isMin ? columnStatistics.getDoubleStatistics().getMin() : columnStatistics.getDoubleStatistics().getMax();
            requireMinMaxPresent(value, orcNoMinMaxMessage);
            // The statistic is stored as a double; it is narrowed to float and written as its raw int bits.
            blockBuilder.writeLong(floatToRawIntBits(value.floatValue()));
            break;
        }
        case DOUBLE: {
            requireMinMaxPresent(columnStatistics.getDoubleStatistics(), orcNoMinMaxMessage);
            Double value = isMin ? columnStatistics.getDoubleStatistics().getMin() : columnStatistics.getDoubleStatistics().getMax();
            requireMinMaxPresent(value, orcNoMinMaxMessage);
            type.writeDouble(blockBuilder, value);
            break;
        }
        case DECIMAL: {
            requireMinMaxPresent(columnStatistics.getDecimalStatistics(), orcNoMinMaxMessage);
            BigDecimal value = isMin ? columnStatistics.getDecimalStatistics().getMin() : columnStatistics.getDecimalStatistics().getMax();
            requireMinMaxPresent(value, orcNoMinMaxMessage);
            // The declared Hive type decides the encoding: a plain long for short decimals,
            // an encoded slice otherwise.
            Type definedType = hiveType.getType(typeManager);
            if (Decimals.isShortDecimal(definedType)) {
                blockBuilder.writeLong(value.unscaledValue().longValue());
            } else {
                type.writeSlice(blockBuilder, Decimals.encodeUnscaledValue(value.unscaledValue()));
            }
            break;
        }
        case BYTE:
        case BOOLEAN:
        case BINARY:
        case UNION:
        case LIST:
        case STRUCT:
        case MAP:
        default:
            throw new IllegalArgumentException("Unsupported type: " + orcType.getOrcTypeKind());
    }
}

/**
 * Fails with the "no min/max" error when a statistics object or bound is absent.
 *
 * @param statistic the statistics object or min/max value to check
 * @param message the message carried by the thrown exception
 * @throws UnsupportedOperationException if {@code statistic} is {@code null}
 */
private static void requireMinMaxPresent(Object statistic, String message) {
    if (statistic == null) {
        throw new UnsupportedOperationException(message);
    }
}
Also used : ColumnStatistics(com.facebook.presto.orc.metadata.statistics.ColumnStatistics) HiveType(com.facebook.presto.hive.HiveType) FixedWidthType(com.facebook.presto.common.type.FixedWidthType) OrcType(com.facebook.presto.orc.metadata.OrcType) Type(com.facebook.presto.common.type.Type) Slice(io.airlift.slice.Slice) BigDecimal(java.math.BigDecimal)

Example 57 with ColumnStatistics

use of com.facebook.presto.orc.metadata.statistics.ColumnStatistics in project urban-eureka by errir503.

the class IcebergOrcFileWriter method computeMetrics.

/**
 * Translates ORC file-level column statistics into Iceberg {@link Metrics}.
 *
 * <p>With no column statistics at all, only the row count is reported. Otherwise every
 * non-excluded column contributes a value count (the file row count), a null count when the
 * ORC statistics expose a number of values, and lower/upper bounds when a min/max pair can be
 * derived. Maps that end up empty are passed to {@link Metrics} as {@code null}.
 *
 * @param icebergSchema schema used to resolve each column's Iceberg field
 * @param orcRowTypes flattened ORC types; index 0 is the root column and is skipped
 * @param fileRowCount number of rows in the file
 * @param columnStatistics ORC statistics, read at the same index as {@code orcRowTypes}
 * @return the metrics for the file
 */
private static Metrics computeMetrics(Schema icebergSchema, List<OrcType> orcRowTypes, long fileRowCount, List<ColumnStatistics> columnStatistics) {
    if (columnStatistics.isEmpty()) {
        return new Metrics(fileRowCount, null, null, null, null, null, null);
    }

    // Descendants of LIST or MAP types are left out for two reasons:
    //   - Apache Iceberg does not use their stats to filter out data files;
    //   - their record count can exceed the table-level row count, so nullCounts cannot be
    //     computed reliably for them.
    // See https://github.com/apache/iceberg/pull/199#discussion_r429443627
    Set<Integer> skippedColumns = getExcludedColumns(orcRowTypes);

    ImmutableMap.Builder<Integer, Long> valueCounts = ImmutableMap.builder();
    ImmutableMap.Builder<Integer, Long> nullCounts = ImmutableMap.builder();
    ImmutableMap.Builder<Integer, ByteBuffer> lowerBounds = ImmutableMap.builder();
    ImmutableMap.Builder<Integer, ByteBuffer> upperBounds = ImmutableMap.builder();

    // Start at 1: OrcColumnId(0) is the root column that represents the file-level schema.
    for (int columnId = 1; columnId < orcRowTypes.size(); columnId++) {
        if (!skippedColumns.contains(columnId)) {
            OrcType orcType = orcRowTypes.get(columnId);
            ColumnStatistics stats = columnStatistics.get(columnId);
            int fieldId = getIcebergId(orcType);
            NestedField field = icebergSchema.findField(fieldId);
            verify(field != null, "Cannot find Iceberg column with ID %s in schema %s", fieldId, icebergSchema);

            valueCounts.put(fieldId, fileRowCount);
            if (stats.hasNumberOfValues()) {
                long nonNullValues = stats.getNumberOfValues();
                nullCounts.put(fieldId, fileRowCount - nonNullValues);
            }
            toIcebergMinMax(stats, field.type()).ifPresent(bounds -> {
                lowerBounds.put(fieldId, bounds.getMin());
                upperBounds.put(fieldId, bounds.getMax());
            });
        }
    }

    Map<Integer, Long> builtValueCounts = valueCounts.build();
    Map<Integer, Long> builtNullCounts = nullCounts.build();
    Map<Integer, ByteBuffer> builtLowerBounds = lowerBounds.build();
    Map<Integer, ByteBuffer> builtUpperBounds = upperBounds.build();

    // TODO: Add column size accounting to ORC column writers (second argument stays null until then)
    return new Metrics(
            fileRowCount,
            null,
            nullIfEmpty(builtValueCounts),
            nullIfEmpty(builtNullCounts),
            null,
            nullIfEmpty(builtLowerBounds),
            nullIfEmpty(builtUpperBounds));
}

/** Returns {@code map} unchanged, or {@code null} when it has no entries. */
private static <K, V> Map<K, V> nullIfEmpty(Map<K, V> map) {
    if (map.isEmpty()) {
        return null;
    }
    return map;
}
Also used : ColumnStatistics(com.facebook.presto.orc.metadata.statistics.ColumnStatistics) ByteBuffer(java.nio.ByteBuffer) ImmutableMap(com.google.common.collect.ImmutableMap) NestedField(org.apache.iceberg.types.Types.NestedField) Metrics(org.apache.iceberg.Metrics) OrcType(com.facebook.presto.orc.metadata.OrcType)

Example 58 with ColumnStatistics

use of com.facebook.presto.orc.metadata.statistics.ColumnStatistics in project urban-eureka by errir503.

the class TestStripeReader method testRowSize.

/**
 * Verifies the minimum average row size computed for a row group holding one integer column
 * and one map column whose key and value streams each report {@code numberOfEntries} integers
 * per row.
 */
@Test
public void testRowSize() {
    int numberOfEntries = 10_000;
    long numRowsInGroup = MILLION;
    IntegerStatistics integerStatistics = new IntegerStatistics(0L, 0L, 0L);
    ColumnStatistics intColumnStatistics = new IntegerColumnStatistics(numRowsInGroup, null, integerStatistics);
    ColumnStatistics mapColumnStatistics = new ColumnStatistics(numRowsInGroup, null);
    // Key and value streams carry numberOfEntries values for every row in the group.
    ColumnStatistics mapKeyColumnStatistics = new IntegerColumnStatistics(numRowsInGroup * numberOfEntries, null, integerStatistics);
    ColumnStatistics mapValueColumnStatistics = new IntegerColumnStatistics(numRowsInGroup * numberOfEntries, null, integerStatistics);
    StreamId intStreamId = new StreamId(1, 0, Stream.StreamKind.ROW_INDEX);
    StreamId mapStreamId = new StreamId(2, 0, Stream.StreamKind.ROW_INDEX);
    StreamId mapKeyStreamId = new StreamId(3, 0, Stream.StreamKind.ROW_INDEX);
    StreamId mapValueStreamId = new StreamId(4, 0, Stream.StreamKind.ROW_INDEX);
    Map<StreamId, List<RowGroupIndex>> columnIndexes = ImmutableMap.of(intStreamId, createRowGroupIndex(intColumnStatistics), mapStreamId, createRowGroupIndex(mapColumnStatistics), mapKeyStreamId, createRowGroupIndex(mapKeyColumnStatistics), mapValueStreamId, createRowGroupIndex(mapValueColumnStatistics));
    // Each row contains 1 integer, 2 * numberOfEntries * integer (2 is for key and value).
    long expectedRowSize = INTEGER_VALUE_BYTES + 2 * numberOfEntries * INTEGER_VALUE_BYTES;
    RowGroup rowGroup = StripeReader.createRowGroup(0, Long.MAX_VALUE, numRowsInGroup, columnIndexes, ImmutableMap.of(), ImmutableMap.of());
    // TestNG's assertEquals takes (actual, expected); keeping that order makes failure messages accurate.
    assertEquals(rowGroup.getMinAverageRowBytes(), expectedRowSize);
}
Also used : IntegerColumnStatistics(com.facebook.presto.orc.metadata.statistics.IntegerColumnStatistics) ColumnStatistics(com.facebook.presto.orc.metadata.statistics.ColumnStatistics) List(java.util.List) ImmutableList(com.google.common.collect.ImmutableList) IntegerStatistics(com.facebook.presto.orc.metadata.statistics.IntegerStatistics) Test(org.testng.annotations.Test)

Example 59 with ColumnStatistics

use of com.facebook.presto.orc.metadata.statistics.ColumnStatistics in project urban-eureka by errir503.

the class AbstractOrcRecordReader method close.

/**
 * Closes the ORC data source and every non-null stream reader, drops the row group reference,
 * and then runs whichever write validations are configured.
 *
 * <p>When a write checksum builder is present, the total row count, each column hash and the
 * stripe hash are validated in that order. When file statistics validation is present, the
 * rebuilt column statistics are handed to the write validation.
 *
 * @throws IOException if closing a resource or a validation step fails
 */
@Override
public void close() throws IOException {
    try (Closer resources = Closer.create()) {
        resources.register(orcDataSource);
        for (StreamReader reader : streamReaders) {
            if (reader == null) {
                continue;
            }
            resources.register(reader::close);
        }
    }

    rowGroups = null;

    if (writeChecksumBuilder.isPresent()) {
        OrcWriteValidation.WriteChecksum computedChecksum = writeChecksumBuilder.get().build();

        validateWrite(
                expected -> expected.getChecksum().getTotalRowCount() == computedChecksum.getTotalRowCount(),
                "Invalid row count");

        List<Long> computedHashes = computedChecksum.getColumnHashes();
        int hashCount = computedHashes.size();
        for (int position = 0; position < hashCount; position++) {
            // Lambdas can only capture effectively final locals, so copy the loop counter.
            int columnIndex = position;
            validateWrite(
                    expected -> expected.getChecksum().getColumnHashes().get(columnIndex).equals(computedHashes.get(columnIndex)),
                    "Invalid checksum for column %s",
                    columnIndex);
        }

        validateWrite(
                expected -> expected.getChecksum().getStripeHash() == computedChecksum.getStripeHash(),
                "Invalid stripes checksum");
    }

    if (fileStatisticsValidation.isPresent()) {
        List<ColumnStatistics> rebuiltStatistics = fileStatisticsValidation.get().build();
        writeValidation.get().validateFileStatistics(orcDataSource.getId(), rebuiltStatistics);
    }
}
Also used : Closer(com.google.common.io.Closer) ColumnStatistics(com.facebook.presto.orc.metadata.statistics.ColumnStatistics) StreamReader(com.facebook.presto.orc.reader.StreamReader) Comparator.comparingLong(java.util.Comparator.comparingLong)

Example 60 with ColumnStatistics

use of com.facebook.presto.orc.metadata.statistics.ColumnStatistics in project urban-eureka by errir503.

the class OrcWriteValidation method validateRowGroupStatistics.

/**
 * Compares the row group statistics read back for one stripe with the statistics recorded for
 * that stripe at write time.
 *
 * <p>For each expected row group: unless its validation mode is {@code HASHED}, the set of
 * columns must match and every column's statistics are compared individually; unless its mode
 * is {@code DETAILED}, the hash of the rebuilt row group statistics must equal the recorded hash.
 *
 * @param orcDataSourceId identifies the file in error reports
 * @param stripeOffset offset of the stripe, used to look up the expected statistics
 * @param actualRowGroupStatistics row group indexes read back, keyed by stream
 * @throws OrcCorruptionException if the stripe is unknown, a stream has a different number of
 *         row groups than expected, the columns differ, or statistics or hashes do not match
 */
public void validateRowGroupStatistics(OrcDataSourceId orcDataSourceId, long stripeOffset, Map<StreamId, List<RowGroupIndex>> actualRowGroupStatistics) throws OrcCorruptionException {
    requireNonNull(actualRowGroupStatistics, "actualRowGroupStatistics is null");
    List<RowGroupStatistics> expectedRowGroupStatistics = rowGroupStatistics.get(stripeOffset);
    if (expectedRowGroupStatistics == null) {
        throw new OrcCorruptionException(orcDataSourceId, "Unexpected stripe at offset %s", stripeOffset);
    }
    int rowGroupCount = expectedRowGroupStatistics.size();
    // Every stream must report exactly as many row groups as were written; this also makes the
    // per-row-group get(rowGroupIndex) calls below safe.
    for (List<RowGroupIndex> rowGroupIndexes : actualRowGroupStatistics.values()) {
        if (rowGroupIndexes.size() != rowGroupCount) {
            throw new OrcCorruptionException(orcDataSourceId, "Unexpected row group count in stripe at offset %s", stripeOffset);
        }
    }
    // The set of actual columns does not depend on the row group, so build it once.
    Set<Integer> actualColumns = actualRowGroupStatistics.keySet().stream().map(StreamId::getColumn).collect(Collectors.toSet());
    for (int rowGroupIndex = 0; rowGroupIndex < rowGroupCount; rowGroupIndex++) {
        RowGroupStatistics expectedRowGroup = expectedRowGroupStatistics.get(rowGroupIndex);
        if (expectedRowGroup.getValidationMode() != HASHED) {
            Map<Integer, ColumnStatistics> expectedStatistics = expectedRowGroup.getColumnStatistics();
            if (!expectedStatistics.keySet().equals(actualColumns)) {
                throw new OrcCorruptionException(orcDataSourceId, "Unexpected column in row group %s in stripe at offset %s", rowGroupIndex, stripeOffset);
            }
            for (Entry<StreamId, List<RowGroupIndex>> entry : actualRowGroupStatistics.entrySet()) {
                ColumnStatistics actual = entry.getValue().get(rowGroupIndex).getColumnStatistics();
                ColumnStatistics expected = expectedStatistics.get(entry.getKey().getColumn());
                validateColumnStatisticsEquivalent(orcDataSourceId, "Row group " + rowGroupIndex + " in stripe at offset " + stripeOffset, actual, expected);
            }
        }
        if (expectedRowGroup.getValidationMode() != DETAILED) {
            RowGroupStatistics actualRowGroup = buildActualRowGroupStatistics(rowGroupIndex, actualRowGroupStatistics);
            if (expectedRowGroup.getHash() != actualRowGroup.getHash()) {
                throw new OrcCorruptionException(orcDataSourceId, "Checksum mismatch for row group %s in stripe at offset %s", rowGroupIndex, stripeOffset);
            }
        }
    }
}
Also used : ColumnStatistics(com.facebook.presto.orc.metadata.statistics.ColumnStatistics) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) List(java.util.List) ArrayList(java.util.ArrayList) ImmutableList(com.google.common.collect.ImmutableList)

Aggregations

ColumnStatistics (com.facebook.presto.orc.metadata.statistics.ColumnStatistics) 99; ImmutableList (com.google.common.collect.ImmutableList) 46; Slice (io.airlift.slice.Slice) 46; List (java.util.List) 46; Stream (com.facebook.presto.orc.metadata.Stream) 38; ArrayList (java.util.ArrayList) 38; RowGroupIndex (com.facebook.presto.orc.metadata.RowGroupIndex) 32; StreamDataOutput (com.facebook.presto.orc.stream.StreamDataOutput) 32; BooleanStreamCheckpoint (com.facebook.presto.orc.checkpoint.BooleanStreamCheckpoint) 26; PresentOutputStream (com.facebook.presto.orc.stream.PresentOutputStream) 26; ImmutableMap (com.google.common.collect.ImmutableMap) 23; LongOutputStream (com.facebook.presto.orc.stream.LongOutputStream) 16; OrcType (com.facebook.presto.orc.metadata.OrcType) 15; LongStreamCheckpoint (com.facebook.presto.orc.checkpoint.LongStreamCheckpoint) 14; Map (java.util.Map) 14; Type (com.facebook.presto.common.type.Type) 13; ImmutableList.toImmutableList (com.google.common.collect.ImmutableList.toImmutableList) 12; IOException (java.io.IOException) 12; HashMap (java.util.HashMap) 12; Optional (java.util.Optional) 12