Search in sources :

Example 66 with ColumnStatistics

use of com.facebook.presto.orc.metadata.statistics.ColumnStatistics in project urban-eureka by errir503.

The method buildRowGroupIndexes of the class ColumnWriterUtils.

/**
 * Assembles one {@link RowGroupIndex} per row group from the row group column
 * statistics and the checkpoints recorded by the streams.
 * <p>
 * Within each row group the positions are laid out in this order: the prepend
 * checkpoint (when {@code prependCheckpoints} is present), the present stream
 * checkpoint (when the present stream reports checkpoints), and then the
 * checkpoint of every data stream in the order the streams were passed in.
 *
 * @param compressed forwarded to {@code toPositionList} of every checkpoint
 * @param rowGroupColumnStatistics statistics per row group; its size is the number of indexes built
 * @param prependCheckpoints optional checkpoints, looked up by row group, whose positions come first
 * @param presentStream stream whose optional checkpoints are looked up by row group
 * @param dataStreams streams whose checkpoints are looked up by row group
 * @return the row group indexes, in row group order
 */
@SafeVarargs
public static List<RowGroupIndex> buildRowGroupIndexes(boolean compressed, List<ColumnStatistics> rowGroupColumnStatistics, Optional<List<? extends StreamCheckpoint>> prependCheckpoints, PresentOutputStream presentStream, ValueOutputStream<? extends StreamCheckpoint>... dataStreams) {
    ImmutableList.Builder<RowGroupIndex> indexes = ImmutableList.builder();
    Optional<List<BooleanStreamCheckpoint>> presentByRowGroup = presentStream.getCheckpoints();
    List<List<? extends StreamCheckpoint>> dataByStream = Arrays.stream(dataStreams)
            .map(ValueOutputStream::getCheckpoints)
            .collect(Collectors.toList());

    for (int rowGroup = 0; rowGroup < rowGroupColumnStatistics.size(); rowGroup++) {
        // the lambdas below need an effectively final copy of the loop counter
        final int index = rowGroup;
        Optional<StreamCheckpoint> prepend = prependCheckpoints.map(perGroup -> perGroup.get(index));
        Optional<StreamCheckpoint> present = presentByRowGroup.map(perGroup -> perGroup.get(index));

        ImmutableList.Builder<Integer> positions = ImmutableList.builder();
        // leading section: prepend positions, then present positions
        prepend.ifPresent(entry -> positions.addAll(entry.toPositionList(compressed)));
        present.ifPresent(entry -> positions.addAll(entry.toPositionList(compressed)));
        // trailing section: one checkpoint per data stream, in argument order
        dataByStream.forEach(perGroup -> positions.addAll(perGroup.get(index).toPositionList(compressed)));

        ColumnStatistics statisticsForGroup = rowGroupColumnStatistics.get(index);
        indexes.add(new RowGroupIndex(positions.build(), statisticsForGroup));
    }
    return indexes.build();
}
Also used : ColumnStatistics(com.facebook.presto.orc.metadata.statistics.ColumnStatistics) ImmutableList(com.google.common.collect.ImmutableList) StreamCheckpoint(com.facebook.presto.orc.checkpoint.StreamCheckpoint) BooleanStreamCheckpoint(com.facebook.presto.orc.checkpoint.BooleanStreamCheckpoint) RowGroupIndex(com.facebook.presto.orc.metadata.RowGroupIndex) List(java.util.List) ImmutableList(com.google.common.collect.ImmutableList) StreamCheckpoint(com.facebook.presto.orc.checkpoint.StreamCheckpoint) BooleanStreamCheckpoint(com.facebook.presto.orc.checkpoint.BooleanStreamCheckpoint)

Example 67 with ColumnStatistics

use of com.facebook.presto.orc.metadata.statistics.ColumnStatistics in project urban-eureka by errir503.

The method createColumnWriter of the class ColumnWriters.

/**
 * Creates a column writer for a given type.
 * <p>
 * The writer implementation is chosen by the ORC type kind of {@code orcTypes.get(nodeIndex)}.
 * For LIST, MAP and STRUCT nodes this method calls itself to create the child writers, taking
 * the child node indexes from {@code orcType.getFieldTypeIndex(...)} and the child types from
 * {@code type.getTypeParameters()}.
 *
 * @param nodeIndex - index of the node in the orcTypes
 * @param sequence - sequence id passed to the created writer and to recursively created child
 * writers (the DATE case passes DEFAULT_SEQUENCE_ID instead, and the DECIMAL writer takes no sequence)
 * @param orcTypes - ORC types, looked up by node index
 * @param type - type of the column; must not be null
 * @param columnWriterOptions - options passed to the writers; also consulted here for the integer and
 * string dictionary encoding flags, the string statistics limit and the set of flattened nodes
 * @param orcEncoding - encoding being written; several type kinds are rejected when it is DWRF,
 * and flat maps are rejected when it is not DWRF
 * @param hiveStorageTimeZone - time zone passed to the timestamp writer and to recursive calls
 * @param dwrfEncryptors - source of the optional encryptor for this node
 * @param metadataWriter - passed to every created writer
 * @return the writer for the node at {@code nodeIndex}
 * @throws NullPointerException if {@code type} is null (via requireNonNull)
 * @throws IllegalArgumentException if the type kind is not handled by the switch below; the
 * checkArgument calls presumably raise the same exception type - TODO confirm which checkArgument is imported
 */
public static ColumnWriter createColumnWriter(int nodeIndex, int sequence, List<OrcType> orcTypes, Type type, ColumnWriterOptions columnWriterOptions, OrcEncoding orcEncoding, DateTimeZone hiveStorageTimeZone, DwrfEncryptionInfo dwrfEncryptors, MetadataWriter metadataWriter) {
    requireNonNull(type, "type is null");
    OrcType orcType = orcTypes.get(nodeIndex);
    // encryptor for this node, if any; handed to most of the writers created below
    Optional<DwrfDataEncryptor> dwrfEncryptor = dwrfEncryptors.getEncryptorByNodeId(nodeIndex);
    switch(orcType.getOrcTypeKind()) {
        case BOOLEAN:
            return new BooleanColumnWriter(nodeIndex, sequence, type, columnWriterOptions, dwrfEncryptor, metadataWriter);
        case FLOAT:
            return new FloatColumnWriter(nodeIndex, sequence, type, columnWriterOptions, dwrfEncryptor, metadataWriter);
        case DOUBLE:
            return new DoubleColumnWriter(nodeIndex, sequence, type, columnWriterOptions, dwrfEncryptor, metadataWriter);
        case BYTE:
            return new ByteColumnWriter(nodeIndex, sequence, type, columnWriterOptions, dwrfEncryptor, metadataWriter);
        case DATE:
            checkArgument(orcEncoding != DWRF, "DWRF does not support %s type", type);
            // NOTE(review): passes DEFAULT_SEQUENCE_ID rather than the sequence argument; presumably
            // intentional because DWRF is rejected just above - confirm
            return new LongColumnWriter(nodeIndex, DEFAULT_SEQUENCE_ID, type, columnWriterOptions, dwrfEncryptor, orcEncoding, DateStatisticsBuilder::new, metadataWriter);
        case SHORT:
            return new LongColumnWriter(nodeIndex, sequence, type, columnWriterOptions, dwrfEncryptor, orcEncoding, IntegerStatisticsBuilder::new, metadataWriter);
        case INT:
        case LONG:
            if (columnWriterOptions.isIntegerDictionaryEncodingEnabled() && orcEncoding == DWRF) {
                // ORC V1 does not support Integer Dictionary encoding. DWRF supports Integer dictionary encoding.
                return new LongDictionaryColumnWriter(nodeIndex, sequence, type, columnWriterOptions, dwrfEncryptor, orcEncoding, metadataWriter);
            }
            return new LongColumnWriter(nodeIndex, sequence, type, columnWriterOptions, dwrfEncryptor, orcEncoding, IntegerStatisticsBuilder::new, metadataWriter);
        case DECIMAL:
            checkArgument(orcEncoding != DWRF, "DWRF does not support %s type", type);
            // NOTE(review): unlike the other writers, this one is created without sequence or
            // dwrfEncryptor - confirm this is intended
            return new DecimalColumnWriter(nodeIndex, type, columnWriterOptions, orcEncoding, metadataWriter);
        case TIMESTAMP:
        case TIMESTAMP_MICROSECONDS:
            return new TimestampColumnWriter(nodeIndex, sequence, type, columnWriterOptions, dwrfEncryptor, orcEncoding, hiveStorageTimeZone, metadataWriter);
        case BINARY:
            return new SliceDirectColumnWriter(nodeIndex, sequence, type, columnWriterOptions, dwrfEncryptor, orcEncoding, BinaryStatisticsBuilder::new, metadataWriter);
        case CHAR:
            checkArgument(orcEncoding != DWRF, "DWRF does not support %s type", type);
        // fall through: once the DWRF check passes, CHAR is handled exactly like VARCHAR and STRING
        case VARCHAR:
        case STRING:
            // dictionary writer when string dictionary encoding is enabled, direct writer otherwise
            if (columnWriterOptions.isStringDictionaryEncodingEnabled()) {
                return new SliceDictionaryColumnWriter(nodeIndex, sequence, type, columnWriterOptions, dwrfEncryptor, orcEncoding, metadataWriter);
            }
            int stringStatisticsLimit = columnWriterOptions.getStringStatisticsLimit();
            return new SliceDirectColumnWriter(nodeIndex, sequence, type, columnWriterOptions, dwrfEncryptor, orcEncoding, () -> new StringStatisticsBuilder(stringStatisticsLimit), metadataWriter);
        case LIST:
            {
                // the single child (field 0 / type parameter 0) is the list element
                Type fieldType = type.getTypeParameters().get(0);
                ColumnWriter elementWriter = createColumnWriter(orcType.getFieldTypeIndex(0), sequence, orcTypes, fieldType, columnWriterOptions, orcEncoding, hiveStorageTimeZone, dwrfEncryptors, metadataWriter);
                return new ListColumnWriter(nodeIndex, sequence, columnWriterOptions, dwrfEncryptor, orcEncoding, elementWriter, metadataWriter);
            }
        case MAP:
            {
                // field 0 / type parameter 0 is the key, field 1 / type parameter 1 is the value
                if (columnWriterOptions.getFlattenedNodes().contains(nodeIndex)) {
                    // this node is configured as a flat map, which is only accepted for DWRF
                    checkArgument(orcEncoding == DWRF, "%s does not support flat maps", orcEncoding);
                    Type valueType = type.getTypeParameters().get(1);
                    OrcType keyOrcType = orcTypes.get(orcType.getFieldTypeIndex(0));
                    Supplier<StatisticsBuilder> keyStatisticsBuilderSupplier = createStatisticsBuilderSupplier(keyOrcType, columnWriterOptions);
                    // value writers should not create their own expensive dictionaries, instead they should use shared dictionaries
                    ColumnWriterOptions valueWriterColumnWriterOptions = columnWriterOptions.copyWithDisabledDictionaryEncoding();
                    // Flat map writer needs to provide column statistics for the value node(s) even if there are no values.
                    // This lambda will provide empty column statistics right away instead of creating an expensive empty
                    // value writer and getting empty stats from it.
                    int valueNodeIndex = orcType.getFieldTypeIndex(1);
                    Supplier<Map<Integer, ColumnStatistics>> emptyColumnStatisticsSupplier = () -> createEmptyColumnStatistics(orcTypes, valueNodeIndex, columnWriterOptions);
                    // value writers are created on demand, each with the sequence the flat map writer asks for
                    IntFunction<ColumnWriter> valueWriterSupplier = (valueSequence) -> createColumnWriter(valueNodeIndex, valueSequence, orcTypes, valueType, valueWriterColumnWriterOptions, orcEncoding, hiveStorageTimeZone, dwrfEncryptors, metadataWriter);
                    return new MapFlatColumnWriter(nodeIndex, orcType.getFieldTypeIndex(0), valueNodeIndex, type.getTypeParameters().get(0), valueType, keyStatisticsBuilderSupplier, columnWriterOptions, dwrfEncryptor, metadataWriter, valueWriterSupplier, emptyColumnStatisticsSupplier);
                }
                // regular map: key and value writers are created eagerly with the caller's sequence
                ColumnWriter keyWriter = createColumnWriter(orcType.getFieldTypeIndex(0), sequence, orcTypes, type.getTypeParameters().get(0), columnWriterOptions, orcEncoding, hiveStorageTimeZone, dwrfEncryptors, metadataWriter);
                ColumnWriter valueWriter = createColumnWriter(orcType.getFieldTypeIndex(1), sequence, orcTypes, type.getTypeParameters().get(1), columnWriterOptions, orcEncoding, hiveStorageTimeZone, dwrfEncryptors, metadataWriter);
                return new MapColumnWriter(nodeIndex, sequence, columnWriterOptions, dwrfEncryptor, orcEncoding, keyWriter, valueWriter, metadataWriter);
            }
        case STRUCT:
            {
                // one child writer per struct field, pairing field i of the ORC type with type parameter i
                ImmutableList.Builder<ColumnWriter> fieldWriters = ImmutableList.builder();
                for (int fieldId = 0; fieldId < orcType.getFieldCount(); fieldId++) {
                    int childNodeIndex = orcType.getFieldTypeIndex(fieldId);
                    Type fieldType = type.getTypeParameters().get(fieldId);
                    fieldWriters.add(createColumnWriter(childNodeIndex, sequence, orcTypes, fieldType, columnWriterOptions, orcEncoding, hiveStorageTimeZone, dwrfEncryptors, metadataWriter));
                }
                return new StructColumnWriter(nodeIndex, sequence, columnWriterOptions, dwrfEncryptor, fieldWriters.build(), metadataWriter);
            }
    }
    // reached only when no case above returned, i.e. the type kind has no writer here
    throw new IllegalArgumentException("Unsupported type: " + type);
}
Also used : StringStatisticsBuilder(com.facebook.presto.orc.metadata.statistics.StringStatisticsBuilder) BinaryStatisticsBuilder(com.facebook.presto.orc.metadata.statistics.BinaryStatisticsBuilder) DateStatisticsBuilder(com.facebook.presto.orc.metadata.statistics.DateStatisticsBuilder) IntegerStatisticsBuilder(com.facebook.presto.orc.metadata.statistics.IntegerStatisticsBuilder) StatisticsBuilder(com.facebook.presto.orc.metadata.statistics.StatisticsBuilder) StringStatisticsBuilder(com.facebook.presto.orc.metadata.statistics.StringStatisticsBuilder) IntegerStatisticsBuilder(com.facebook.presto.orc.metadata.statistics.IntegerStatisticsBuilder) Supplier(java.util.function.Supplier) StatisticsBuilders.createStatisticsBuilderSupplier(com.facebook.presto.orc.metadata.statistics.StatisticsBuilders.createStatisticsBuilderSupplier) DateStatisticsBuilder(com.facebook.presto.orc.metadata.statistics.DateStatisticsBuilder) StatisticsBuilders.createEmptyColumnStatistics(com.facebook.presto.orc.metadata.statistics.StatisticsBuilders.createEmptyColumnStatistics) ColumnStatistics(com.facebook.presto.orc.metadata.statistics.ColumnStatistics) BinaryStatisticsBuilder(com.facebook.presto.orc.metadata.statistics.BinaryStatisticsBuilder) DwrfDataEncryptor(com.facebook.presto.orc.DwrfDataEncryptor) OrcType(com.facebook.presto.orc.metadata.OrcType) Type(com.facebook.presto.common.type.Type) ColumnWriterOptions(com.facebook.presto.orc.ColumnWriterOptions) OrcType(com.facebook.presto.orc.metadata.OrcType) IntFunction(java.util.function.IntFunction)

Example 68 with ColumnStatistics

use of com.facebook.presto.orc.metadata.statistics.ColumnStatistics in project urban-eureka by errir503.

The method finishRowGroup of the class DoubleColumnWriter.

/**
 * Finishes the current row group: builds the statistics collected by the current
 * statistics builder, appends them to {@code rowGroupColumnStatistics}, adds their
 * retained size to {@code columnStatisticsRetainedSizeInBytes}, and installs a
 * fresh {@code DoubleStatisticsBuilder} for the next row group.
 * <p>
 * Fails the {@code checkState} precondition when this writer is already closed.
 *
 * @return a single-entry map from {@code column} to the statistics just built
 */
@Override
public Map<Integer, ColumnStatistics> finishRowGroup() {
    checkState(!closed);

    // snapshot the statistics of the row group being finished and record them
    ColumnStatistics rowGroupStatistics = statisticsBuilder.buildColumnStatistics();
    rowGroupColumnStatistics.add(rowGroupStatistics);
    columnStatisticsRetainedSizeInBytes += rowGroupStatistics.getRetainedSizeInBytes();

    // the next row group starts from an empty builder
    statisticsBuilder = new DoubleStatisticsBuilder();

    return ImmutableMap.of(column, rowGroupStatistics);
}
Also used : ColumnStatistics(com.facebook.presto.orc.metadata.statistics.ColumnStatistics) DoubleStatisticsBuilder(com.facebook.presto.orc.metadata.statistics.DoubleStatisticsBuilder)

Example 69 with ColumnStatistics

use of com.facebook.presto.orc.metadata.statistics.ColumnStatistics in project urban-eureka by errir503.

The method finishRowGroup of the class FloatColumnWriter.

/**
 * Finishes the current row group: builds the statistics collected by the current
 * statistics builder, appends them to {@code rowGroupColumnStatistics}, adds their
 * retained size to {@code columnStatisticsRetainedSizeInBytes}, and installs a
 * fresh builder for the next row group. The replacement builder is a
 * {@code DoubleStatisticsBuilder}.
 * <p>
 * Fails the {@code checkState} precondition when this writer is already closed.
 *
 * @return a single-entry map from {@code column} to the statistics just built
 */
@Override
public Map<Integer, ColumnStatistics> finishRowGroup() {
    checkState(!closed);

    // snapshot the statistics of the row group being finished and record them
    ColumnStatistics rowGroupStatistics = statisticsBuilder.buildColumnStatistics();
    rowGroupColumnStatistics.add(rowGroupStatistics);
    columnStatisticsRetainedSizeInBytes += rowGroupStatistics.getRetainedSizeInBytes();

    // the next row group starts from an empty builder
    statisticsBuilder = new DoubleStatisticsBuilder();

    return ImmutableMap.of(column, rowGroupStatistics);
}
Also used : ColumnStatistics(com.facebook.presto.orc.metadata.statistics.ColumnStatistics) DoubleStatisticsBuilder(com.facebook.presto.orc.metadata.statistics.DoubleStatisticsBuilder)

Example 70 with ColumnStatistics

use of com.facebook.presto.orc.metadata.statistics.ColumnStatistics in project urban-eureka by errir503.

The method finishRowGroup of the class SliceDirectColumnWriter.

/**
 * Finishes the current row group: builds the statistics collected by the current
 * statistics builder, appends them to {@code rowGroupColumnStatistics}, adds their
 * retained size to {@code columnStatisticsRetainedSizeInBytes}, and obtains the
 * builder for the next row group from {@code statisticsBuilderSupplier}.
 * <p>
 * Fails the {@code checkState} precondition when this writer is already closed.
 *
 * @return a single-entry map from {@code column} to the statistics just built
 */
@Override
public Map<Integer, ColumnStatistics> finishRowGroup() {
    checkState(!closed);

    // snapshot the statistics of the row group being finished and record them
    ColumnStatistics rowGroupStatistics = statisticsBuilder.buildColumnStatistics();
    rowGroupColumnStatistics.add(rowGroupStatistics);
    columnStatisticsRetainedSizeInBytes += rowGroupStatistics.getRetainedSizeInBytes();

    // the next row group gets whatever builder the supplier hands out
    statisticsBuilder = statisticsBuilderSupplier.get();

    return ImmutableMap.of(column, rowGroupStatistics);
}
Also used : ColumnStatistics(com.facebook.presto.orc.metadata.statistics.ColumnStatistics)

Aggregations

ColumnStatistics (com.facebook.presto.orc.metadata.statistics.ColumnStatistics)99 ImmutableList (com.google.common.collect.ImmutableList)46 Slice (io.airlift.slice.Slice)46 List (java.util.List)46 Stream (com.facebook.presto.orc.metadata.Stream)38 ArrayList (java.util.ArrayList)38 RowGroupIndex (com.facebook.presto.orc.metadata.RowGroupIndex)32 StreamDataOutput (com.facebook.presto.orc.stream.StreamDataOutput)32 BooleanStreamCheckpoint (com.facebook.presto.orc.checkpoint.BooleanStreamCheckpoint)26 PresentOutputStream (com.facebook.presto.orc.stream.PresentOutputStream)26 ImmutableMap (com.google.common.collect.ImmutableMap)23 LongOutputStream (com.facebook.presto.orc.stream.LongOutputStream)16 OrcType (com.facebook.presto.orc.metadata.OrcType)15 LongStreamCheckpoint (com.facebook.presto.orc.checkpoint.LongStreamCheckpoint)14 Map (java.util.Map)14 Type (com.facebook.presto.common.type.Type)13 ImmutableList.toImmutableList (com.google.common.collect.ImmutableList.toImmutableList)12 IOException (java.io.IOException)12 HashMap (java.util.HashMap)12 Optional (java.util.Optional)12