Use of com.facebook.presto.orc.metadata.statistics.ColumnStatistics in project presto by prestodb.
From class ByteColumnWriter, method getIndexStreams:
@Override
public List<StreamDataOutput> getIndexStreams() throws IOException {
    checkState(closed);
    ImmutableList.Builder<RowGroupIndex> rowGroupIndexes = ImmutableList.builder();
    List<ByteStreamCheckpoint> dataCheckpoints = dataStream.getCheckpoints();
    Optional<List<BooleanStreamCheckpoint>> presentCheckpoints = presentStream.getCheckpoints();
    for (int i = 0; i < rowGroupColumnStatistics.size(); i++) {
        int groupId = i;
        ColumnStatistics columnStatistics = rowGroupColumnStatistics.get(groupId);
        ByteStreamCheckpoint dataCheckpoint = dataCheckpoints.get(groupId);
        Optional<BooleanStreamCheckpoint> presentCheckpoint = presentCheckpoints.map(checkpoints -> checkpoints.get(groupId));
        List<Integer> positions = createByteColumnPositionList(compressed, dataCheckpoint, presentCheckpoint);
        rowGroupIndexes.add(new RowGroupIndex(positions, columnStatistics));
    }
    Slice slice = metadataWriter.writeRowIndexes(rowGroupIndexes.build());
    Stream stream = new Stream(column, StreamKind.ROW_INDEX, slice.length(), false);
    return ImmutableList.of(new StreamDataOutput(slice, stream));
}
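Each RowGroupIndex pairs a position list with that row group's ColumnStatistics. For context, a position list is a flat list of integers that a reader interprets as seek offsets into the column's streams; the sketch below illustrates the idea for a compressed stream. The class and offset names are hypothetical, not Presto's representation:

import java.util.List;
import com.google.common.collect.ImmutableList;

// Illustrative only: for a compressed stream, a row group position typically
// records where the compressed block starts and the offset of the row group
// within the decompressed block; an uncompressed stream needs just one offset.
public final class PositionListSketch {
    public static List<Integer> compressedPositions(int compressedBlockOffset, int decompressedOffset) {
        return ImmutableList.of(compressedBlockOffset, decompressedOffset);
    }

    public static void main(String[] args) {
        System.out.println(compressedPositions(4096, 128)); // [4096, 128]
    }
}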
Use of com.facebook.presto.orc.metadata.statistics.ColumnStatistics in project presto by prestodb.
From class TestStripeReader, method testRowSize:
@Test
public void testRowSize() {
    int numberOfEntries = 10_000;
    long numRowsInGroup = MILLION;
    IntegerStatistics integerStatistics = new IntegerStatistics(0L, 0L, 0L);
    ColumnStatistics intColumnStatistics = new IntegerColumnStatistics(numRowsInGroup, null, integerStatistics);
    ColumnStatistics mapColumnStatistics = new ColumnStatistics(numRowsInGroup, null);
    ColumnStatistics mapKeyColumnStatistics = new IntegerColumnStatistics(numRowsInGroup * numberOfEntries, null, integerStatistics);
    ColumnStatistics mapValueColumnStatistics = new IntegerColumnStatistics(numRowsInGroup * numberOfEntries, null, integerStatistics);
    StreamId intStreamId = new StreamId(1, 0, Stream.StreamKind.ROW_INDEX);
    StreamId mapStreamId = new StreamId(2, 0, Stream.StreamKind.ROW_INDEX);
    StreamId mapKeyStreamId = new StreamId(3, 0, Stream.StreamKind.ROW_INDEX);
    StreamId mapValueStreamId = new StreamId(4, 0, Stream.StreamKind.ROW_INDEX);
    Map<StreamId, List<RowGroupIndex>> columnIndexes = ImmutableMap.of(
            intStreamId, createRowGroupIndex(intColumnStatistics),
            mapStreamId, createRowGroupIndex(mapColumnStatistics),
            mapKeyStreamId, createRowGroupIndex(mapKeyColumnStatistics),
            mapValueStreamId, createRowGroupIndex(mapValueColumnStatistics));
    // Each row contains 1 integer plus 2 * numberOfEntries integers (a key and a value per map entry).
    long expectedRowSize = INTEGER_VALUE_BYTES + 2 * numberOfEntries * INTEGER_VALUE_BYTES;
    RowGroup rowGroup = StripeReader.createRowGroup(0, Long.MAX_VALUE, numRowsInGroup, columnIndexes, ImmutableMap.of(), ImmutableMap.of());
    assertEquals(expectedRowSize, rowGroup.getMinAverageRowBytes());
}
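The test relies on a createRowGroupIndex helper that is not shown in this excerpt. A plausible minimal version, reusing the RowGroupIndex(positions, columnStatistics) constructor seen in the writer snippet above (an assumption about the helper, not the actual test source):

private static List<RowGroupIndex> createRowGroupIndex(ColumnStatistics columnStatistics) {
    // Positions are irrelevant to the row-size computation under test,
    // so a single row group with an empty position list is enough.
    return ImmutableList.of(new RowGroupIndex(ImmutableList.of(), columnStatistics));
}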
Use of com.facebook.presto.orc.metadata.statistics.ColumnStatistics in project presto by prestodb.
From class AggregatedOrcPageSource, method writeNonNullCount:
private void writeNonNullCount(int columnIndex, BlockBuilder blockBuilder) {
    // File-level statistics are indexed from 1; index 0 holds the root struct.
    ColumnStatistics columnStatistics = footer.getFileStats().get(columnIndex + 1);
    if (!columnStatistics.hasNumberOfValues()) {
        throw new UnsupportedOperationException("Number of values not set for orc file. Set session property hive.pushdown_partial_aggregations_into_scan=false and execute query again");
    }
    blockBuilder.writeLong(columnStatistics.getNumberOfValues());
}
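getNumberOfValues() is the non-null value count recorded in the file footer, which is exactly what a pushed-down count(column) needs; count(*) would instead use the footer's total row count. A hedged sketch of the derived null count (a standalone illustration, not Presto code):

// Illustrative only: given the footer's total row count (e.g. Footer.getNumberOfRows())
// and a column's non-null count, the null count falls out by subtraction.
private static long nullCount(long totalRows, long nonNullValues) {
    return totalRows - nonNullValues;
}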
Use of com.facebook.presto.orc.metadata.statistics.ColumnStatistics in project presto by prestodb.
From class AggregatedOrcPageSource, method writeMinMax:
private void writeMinMax(int columnIndex, Type type, HiveType hiveType, BlockBuilder blockBuilder, boolean isMin) {
    ColumnStatistics columnStatistics = footer.getFileStats().get(columnIndex + 1);
    OrcType orcType = footer.getTypes().get(columnIndex + 1);
    if (type instanceof FixedWidthType) {
        completedBytes += ((FixedWidthType) type).getFixedSize();
    }
    String orcNoMinMaxMessage = "No min/max found for orc file. Set session property hive.pushdown_partial_aggregations_into_scan=false and execute query again";
    switch (orcType.getOrcTypeKind()) {
        case SHORT:
        case INT:
        case LONG: {
            Long value = isMin ? columnStatistics.getIntegerStatistics().getMin() : columnStatistics.getIntegerStatistics().getMax();
            if (value == null) {
                throw new UnsupportedOperationException(orcNoMinMaxMessage);
            }
            else {
                blockBuilder.writeLong(value);
            }
            break;
        }
        case TIMESTAMP:
        case DATE: {
            Integer value = isMin ? columnStatistics.getDateStatistics().getMin() : columnStatistics.getDateStatistics().getMax();
            if (value == null) {
                throw new UnsupportedOperationException(orcNoMinMaxMessage);
            }
            else {
                blockBuilder.writeLong(Long.valueOf(value));
            }
            break;
        }
        case VARCHAR:
        case CHAR:
        case STRING: {
            Slice value = isMin ? columnStatistics.getStringStatistics().getMin() : columnStatistics.getStringStatistics().getMax();
            if (value == null) {
                throw new UnsupportedOperationException(orcNoMinMaxMessage);
            }
            else {
                blockBuilder.writeBytes(value, 0, value.length()).closeEntry();
                completedBytes += value.length();
            }
            break;
        }
        case FLOAT: {
            Double value = isMin ? columnStatistics.getDoubleStatistics().getMin() : columnStatistics.getDoubleStatistics().getMax();
            if (value == null) {
                throw new UnsupportedOperationException(orcNoMinMaxMessage);
            }
            else {
                blockBuilder.writeLong(floatToRawIntBits(value.floatValue()));
            }
            break;
        }
        case DOUBLE: {
            Double value = isMin ? columnStatistics.getDoubleStatistics().getMin() : columnStatistics.getDoubleStatistics().getMax();
            if (value == null) {
                throw new UnsupportedOperationException(orcNoMinMaxMessage);
            }
            else {
                type.writeDouble(blockBuilder, value);
            }
            break;
        }
        case DECIMAL: {
            BigDecimal value = isMin ? columnStatistics.getDecimalStatistics().getMin() : columnStatistics.getDecimalStatistics().getMax();
            if (value == null) {
                throw new UnsupportedOperationException(orcNoMinMaxMessage);
            }
            else {
                Type definedType = hiveType.getType(typeManager);
                if (Decimals.isShortDecimal(definedType)) {
                    blockBuilder.writeLong(value.unscaledValue().longValue());
                }
                else {
                    type.writeSlice(blockBuilder, Decimals.encodeUnscaledValue(value.unscaledValue()));
                }
            }
            break;
        }
        case BYTE:
        case BOOLEAN:
        case BINARY:
        case UNION:
        case LIST:
        case STRUCT:
        case MAP:
        default:
            throw new IllegalArgumentException("Unsupported type: " + orcType.getOrcTypeKind());
    }
}
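Note the FLOAT case: the min/max is stored as raw int bits widened to a long, matching how REAL values live in long-backed blocks. A small standalone illustration of that round trip (plain JDK, no Presto dependencies):

import static java.lang.Float.floatToRawIntBits;
import static java.lang.Float.intBitsToFloat;

public final class RealBitsRoundTrip {
    public static void main(String[] args) {
        float min = 1.5f;
        long stored = floatToRawIntBits(min);           // what writeLong receives
        float recovered = intBitsToFloat((int) stored); // what a reader decodes
        System.out.println(recovered);                  // prints 1.5
    }
}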
Use of com.facebook.presto.orc.metadata.statistics.ColumnStatistics in project presto by prestodb.
From class StripeReader, method readStripe:
public Stripe readStripe(StripeInformation stripe, OrcAggregatedMemoryContext systemMemoryUsage, Optional<DwrfEncryptionInfo> decryptors, SharedBuffer sharedDecompressionBuffer) throws IOException {
    StripeId stripeId = new StripeId(orcDataSource.getId(), stripe.getOffset());

    // read the stripe footer
    StripeFooter stripeFooter = readStripeFooter(stripeId, stripe, systemMemoryUsage);

    // get streams for selected columns
    List<List<Stream>> allStreams = new ArrayList<>();
    allStreams.add(stripeFooter.getStreams());
    Map<StreamId, Stream> includedStreams = new HashMap<>();
    boolean hasRowGroupDictionary = addIncludedStreams(stripeFooter.getColumnEncodings(), stripeFooter.getStreams(), includedStreams);
    Map<Integer, ColumnEncoding> columnEncodings = new HashMap<>();
    Map<Integer, ColumnEncoding> stripeFooterEncodings = stripeFooter.getColumnEncodings();
    columnEncodings.putAll(stripeFooterEncodings);

    // included columns may be encrypted
    if (decryptors.isPresent()) {
        List<Slice> encryptedEncryptionGroups = stripeFooter.getStripeEncryptionGroups();
        for (Integer groupId : decryptors.get().getEncryptorGroupIds()) {
            StripeEncryptionGroup stripeEncryptionGroup = getStripeEncryptionGroup(decryptors.get().getEncryptorByGroupId(groupId), encryptedEncryptionGroups.get(groupId), dwrfEncryptionGroupColumns.get(groupId), systemMemoryUsage);
            allStreams.add(stripeEncryptionGroup.getStreams());
            columnEncodings.putAll(stripeEncryptionGroup.getColumnEncodings());
            boolean encryptedHasRowGroupDictionary = addIncludedStreams(stripeEncryptionGroup.getColumnEncodings(), stripeEncryptionGroup.getStreams(), includedStreams);
            hasRowGroupDictionary = encryptedHasRowGroupDictionary || hasRowGroupDictionary;
        }
    }

    // handle stripes with more than one row group or a dictionary
    boolean invalidCheckPoint = false;
    if ((stripe.getNumberOfRows() > rowsInRowGroup) || hasRowGroupDictionary) {
        // determine ranges of the stripe to read
        Map<StreamId, DiskRange> diskRanges = getDiskRanges(allStreams);
        diskRanges = Maps.filterKeys(diskRanges, Predicates.in(includedStreams.keySet()));

        // read the file regions
        Map<StreamId, OrcInputStream> streamsData = readDiskRanges(stripeId, diskRanges, systemMemoryUsage, decryptors, sharedDecompressionBuffer);

        // read the row index for each column
        Map<StreamId, List<RowGroupIndex>> columnIndexes = readColumnIndexes(includedStreams, streamsData, stripeId);
        if (writeValidation.isPresent()) {
            writeValidation.get().validateRowGroupStatistics(orcDataSource.getId(), stripe.getOffset(), columnIndexes);
        }

        // select the row groups matching the tuple domain
        Set<Integer> selectedRowGroups = selectRowGroups(stripe, columnIndexes);

        // if all row groups are skipped, return null
        if (selectedRowGroups.isEmpty()) {
            // set accounted memory usage to zero
            systemMemoryUsage.close();
            return null;
        }

        // value streams
        Map<StreamId, ValueInputStream<?>> valueStreams = createValueStreams(includedStreams, streamsData, columnEncodings);

        // build the dictionary streams
        InputStreamSources dictionaryStreamSources = createDictionaryStreamSources(includedStreams, valueStreams, columnEncodings);

        // build the row groups
        try {
            List<RowGroup> rowGroups = createRowGroups(stripe.getNumberOfRows(), includedStreams, valueStreams, columnIndexes, selectedRowGroups, columnEncodings);
            return new Stripe(stripe.getNumberOfRows(), columnEncodings, rowGroups, dictionaryStreamSources);
        }
        catch (InvalidCheckpointException e) {
            // we must fail because the length of the row group dictionary is contained in the checkpoint stream
            if (hasRowGroupDictionary) {
                throw new OrcCorruptionException(e, orcDataSource.getId(), "Checkpoints are corrupt");
            }
            invalidCheckPoint = true;
        }
    }

    // stripe only has one row group and no dictionary
    ImmutableMap.Builder<StreamId, DiskRange> diskRangesBuilder = ImmutableMap.builder();
    for (Entry<StreamId, DiskRange> entry : getDiskRanges(allStreams).entrySet()) {
        StreamId streamId = entry.getKey();
        if (includedStreams.keySet().contains(streamId)) {
            diskRangesBuilder.put(entry);
        }
    }
    ImmutableMap<StreamId, DiskRange> diskRanges = diskRangesBuilder.build();

    // read the file regions
    Map<StreamId, OrcInputStream> streamsData = readDiskRanges(stripeId, diskRanges, systemMemoryUsage, decryptors, sharedDecompressionBuffer);

    long totalBytes = 0;
    for (Entry<StreamId, Stream> entry : includedStreams.entrySet()) {
        if (entry.getKey().getStreamKind() == ROW_INDEX) {
            List<RowGroupIndex> rowGroupIndexes = metadataReader.readRowIndexes(hiveWriterVersion, streamsData.get(entry.getKey()), null);
            checkState(rowGroupIndexes.size() == 1 || invalidCheckPoint, "expect a single row group or an invalid check point");
            for (RowGroupIndex rowGroupIndex : rowGroupIndexes) {
                ColumnStatistics columnStatistics = rowGroupIndex.getColumnStatistics();
                if (columnStatistics.hasMinAverageValueSizeInBytes()) {
                    totalBytes += columnStatistics.getTotalValueSizeInBytes();
                }
            }
        }
    }

    // value streams
    Map<StreamId, ValueInputStream<?>> valueStreams = createValueStreams(includedStreams, streamsData, columnEncodings);

    // build the dictionary streams
    InputStreamSources dictionaryStreamSources = createDictionaryStreamSources(includedStreams, valueStreams, columnEncodings);

    // build the row group
    ImmutableMap.Builder<StreamId, InputStreamSource<?>> builder = ImmutableMap.builder();
    for (Entry<StreamId, ValueInputStream<?>> entry : valueStreams.entrySet()) {
        builder.put(entry.getKey(), new ValueInputStreamSource<>(entry.getValue()));
    }
    RowGroup rowGroup = new RowGroup(0, 0, stripe.getNumberOfRows(), totalBytes, new InputStreamSources(builder.build()));
    return new Stripe(stripe.getNumberOfRows(), columnEncodings, ImmutableList.of(rowGroup), dictionaryStreamSources);
}
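One subtlety worth calling out: readStripe returns null when every row group in the stripe is pruned by the tuple domain, so callers must treat null as "skip this stripe". A hedged sketch of a consuming loop (the surrounding variables are assumptions for illustration, not Presto source):

// Illustrative only: iterate the stripes listed in the footer and skip pruned ones.
for (StripeInformation stripeInfo : footer.getStripes()) {
    Stripe stripe = stripeReader.readStripe(stripeInfo, memoryContext, decryptors, sharedDecompressionBuffer);
    if (stripe == null) {
        continue; // all row groups skipped; accounted memory was already released
    }
    // ... wire stripe.getRowGroups() and dictionary stream sources into column readers ...
}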