
Example 11 with OrcColumnId

Use of io.trino.orc.metadata.OrcColumnId in project trino by trinodb.

From class StripeReader, method readBloomFilterIndexes:

private Map<OrcColumnId, List<BloomFilter>> readBloomFilterIndexes(Map<StreamId, Stream> streams, Map<StreamId, OrcChunkLoader> streamsData) throws IOException {
    HashMap<OrcColumnId, List<BloomFilter>> bloomFilters = new HashMap<>();
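    // First pass: read the modern BLOOM_FILTER_UTF8 streams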
    for (Entry<StreamId, Stream> entry : streams.entrySet()) {
        Stream stream = entry.getValue();
        if (stream.getStreamKind() == BLOOM_FILTER_UTF8) {
            OrcInputStream inputStream = new OrcInputStream(streamsData.get(entry.getKey()));
            bloomFilters.put(stream.getColumnId(), metadataReader.readBloomFilterIndexes(inputStream));
        }
    }
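    // Second pass: fall back to legacy BLOOM_FILTER streams for columns that have no UTF8 filter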
    for (Entry<StreamId, Stream> entry : streams.entrySet()) {
        Stream stream = entry.getValue();
        if (stream.getStreamKind() == BLOOM_FILTER && !bloomFilters.containsKey(stream.getColumnId())) {
            OrcInputStream inputStream = new OrcInputStream(streamsData.get(entry.getKey()));
            bloomFilters.put(stream.getColumnId(), metadataReader.readBloomFilterIndexes(inputStream));
        }
    }
    return ImmutableMap.copyOf(bloomFilters);
}
Also used: OrcColumnId (io.trino.orc.metadata.OrcColumnId), OrcInputStream (io.trino.orc.stream.OrcInputStream), HashMap (java.util.HashMap), List (java.util.List), ArrayList (java.util.ArrayList), ImmutableList (com.google.common.collect.ImmutableList), Stream (io.trino.orc.metadata.Stream), ValueInputStream (io.trino.orc.stream.ValueInputStream), InputStream (java.io.InputStream)

Example 12 with OrcColumnId

Use of io.trino.orc.metadata.OrcColumnId in project trino by trinodb.

From class Checkpoints, method getStreamCheckpoints:

public static Map<StreamId, StreamCheckpoint> getStreamCheckpoints(
        Set<OrcColumnId> columns,
        ColumnMetadata<OrcType> columnTypes,
        boolean compressed,
        int rowGroupId,
        ColumnMetadata<ColumnEncoding> columnEncodings,
        Map<StreamId, Stream> streams,
        Map<StreamId, List<RowGroupIndex>> columnIndexes) throws InvalidCheckpointException {
    ImmutableSetMultimap.Builder<OrcColumnId, StreamKind> streamKindsBuilder = ImmutableSetMultimap.builder();
    for (Stream stream : streams.values()) {
        streamKindsBuilder.put(stream.getColumnId(), stream.getStreamKind());
    }
    SetMultimap<OrcColumnId, StreamKind> streamKinds = streamKindsBuilder.build();
    ImmutableMap.Builder<StreamId, StreamCheckpoint> checkpoints = ImmutableMap.builder();
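    // Slice each selected column's row group positions into per-stream checkpoints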
    for (Map.Entry<StreamId, List<RowGroupIndex>> entry : columnIndexes.entrySet()) {
        OrcColumnId columnId = entry.getKey().getColumnId();
        if (!columns.contains(columnId)) {
            continue;
        }
        List<Integer> positionsList = entry.getValue().get(rowGroupId).getPositions();
        ColumnEncodingKind columnEncoding = columnEncodings.get(columnId).getColumnEncodingKind();
        OrcTypeKind columnType = columnTypes.get(columnId).getOrcTypeKind();
        Set<StreamKind> availableStreams = streamKinds.get(columnId);
        ColumnPositionsList columnPositionsList = new ColumnPositionsList(columnId, columnType, positionsList);
        switch (columnType) {
            case BOOLEAN:
                checkpoints.putAll(getBooleanColumnCheckpoints(columnId, compressed, availableStreams, columnPositionsList));
                break;
            case BYTE:
                checkpoints.putAll(getByteColumnCheckpoints(columnId, compressed, availableStreams, columnPositionsList));
                break;
            case SHORT:
            case INT:
            case LONG:
            case DATE:
                checkpoints.putAll(getLongColumnCheckpoints(columnId, columnEncoding, compressed, availableStreams, columnPositionsList));
                break;
            case FLOAT:
                checkpoints.putAll(getFloatColumnCheckpoints(columnId, compressed, availableStreams, columnPositionsList));
                break;
            case DOUBLE:
                checkpoints.putAll(getDoubleColumnCheckpoints(columnId, compressed, availableStreams, columnPositionsList));
                break;
            case TIMESTAMP:
            case TIMESTAMP_INSTANT:
                checkpoints.putAll(getTimestampColumnCheckpoints(columnId, columnEncoding, compressed, availableStreams, columnPositionsList));
                break;
            case BINARY:
            case STRING:
            case VARCHAR:
            case CHAR:
                checkpoints.putAll(getSliceColumnCheckpoints(columnId, columnEncoding, compressed, availableStreams, columnPositionsList));
                break;
            case LIST:
            case MAP:
                checkpoints.putAll(getListOrMapColumnCheckpoints(columnId, columnEncoding, compressed, availableStreams, columnPositionsList));
                break;
            case STRUCT:
                checkpoints.putAll(getStructColumnCheckpoints(columnId, compressed, availableStreams, columnPositionsList));
                break;
            case DECIMAL:
                checkpoints.putAll(getDecimalColumnCheckpoints(columnId, columnEncoding, compressed, availableStreams, columnPositionsList));
                break;
            default:
                throw new IllegalArgumentException("Unsupported column type " + columnType);
        }
    }
    return checkpoints.buildOrThrow();
}
Also used: OrcColumnId (io.trino.orc.metadata.OrcColumnId), StreamId (io.trino.orc.StreamId), StreamKind (io.trino.orc.metadata.Stream.StreamKind), ImmutableSetMultimap (com.google.common.collect.ImmutableSetMultimap), OrcTypeKind (io.trino.orc.metadata.OrcType.OrcTypeKind), ImmutableMap (com.google.common.collect.ImmutableMap), Stream (io.trino.orc.metadata.Stream), ImmutableList (com.google.common.collect.ImmutableList), List (java.util.List), Map (java.util.Map), InputStreamCheckpoint.createInputStreamCheckpoint (io.trino.orc.checkpoint.InputStreamCheckpoint.createInputStreamCheckpoint), ColumnEncodingKind (io.trino.orc.metadata.ColumnEncoding.ColumnEncodingKind)
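ColumnPositionsList is consumed as a sequential cursor: each getXxxColumnCheckpoints helper above pulls however many positions its streams need, in stream order (the real class also takes the column id and type, as the constructor call shows). A simplified stand-in for that cursor, as a sketch rather than the actual Trino class:

import java.util.List;

// Simplified cursor over a row group's flat position list; checkpoint
// builders consume positions strictly in order.
final class PositionsCursor {
    private final List<Integer> positions;
    private int nextIndex;

    PositionsCursor(List<Integer> positions) {
        this.positions = positions;
    }

    int nextPosition() {
        if (nextIndex >= positions.size()) {
            throw new IllegalStateException("Not enough positions for this row group");
        }
        return positions.get(nextIndex++);
    }
}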

Example 13 with OrcColumnId

Use of io.trino.orc.metadata.OrcColumnId in project trino by trinodb.

From class TypeConverter, method toOrcMapType:

private static List<OrcType> toOrcMapType(int nextFieldTypeIndex, Types.MapType mapType, Map<String, String> attributes) {
    nextFieldTypeIndex++;
    Map<String, String> keyAttributes = ImmutableMap.<String, String>builder()
            .put(ORC_ICEBERG_ID_KEY, Integer.toString(mapType.keyId()))
            .put(ORC_ICEBERG_REQUIRED_KEY, Boolean.toString(true))
            .buildOrThrow();
    List<OrcType> keyTypes = toOrcType(nextFieldTypeIndex, mapType.keyType(), keyAttributes);
    Map<String, String> valueAttributes = ImmutableMap.<String, String>builder()
            .put(ORC_ICEBERG_ID_KEY, Integer.toString(mapType.valueId()))
            .put(ORC_ICEBERG_REQUIRED_KEY, Boolean.toString(mapType.isValueRequired()))
            .buildOrThrow();
    List<OrcType> valueTypes = toOrcType(nextFieldTypeIndex + keyTypes.size(), mapType.valueType(), valueAttributes);
    List<OrcType> orcTypes = new ArrayList<>();
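    // The MAP node's children point at the key subtree root and the value subtree root (offset by the key subtree size)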
    orcTypes.add(new OrcType(
            OrcTypeKind.MAP,
            ImmutableList.of(new OrcColumnId(nextFieldTypeIndex), new OrcColumnId(nextFieldTypeIndex + keyTypes.size())),
            ImmutableList.of("key", "value"),
            Optional.empty(),
            Optional.empty(),
            Optional.empty(),
            attributes));
    orcTypes.addAll(keyTypes);
    orcTypes.addAll(valueTypes);
    return orcTypes;
}
Also used: OrcColumnId (io.trino.orc.metadata.OrcColumnId), OrcType (io.trino.orc.metadata.OrcType), ArrayList (java.util.ArrayList)
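The ORC type list is a preorder flattening of the type tree: the MAP node is followed by its whole key subtree, then its value subtree, which is why the value child's id is offset by keyTypes.size(). For instance, a map<int, struct<a:string, b:double>> whose MAP node lands at index 1 would flatten as (illustrative indices, not taken from the source):

1: MAP     children = [2, 3]
2: INT     (key subtree, size 1)
3: STRUCT  children = [4, 5]  (value subtree root)
4: STRING  (field a)
5: DOUBLE  (field b)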

Example 14 with OrcColumnId

Use of io.trino.orc.metadata.OrcColumnId in project trino by trinodb.

From class IcebergOrcFileWriter, method computeMetrics:

private static Metrics computeMetrics(
        MetricsConfig metricsConfig,
        Schema icebergSchema,
        ColumnMetadata<OrcType> orcColumns,
        long fileRowCount,
        Optional<ColumnMetadata<ColumnStatistics>> columnStatistics) {
    if (columnStatistics.isEmpty()) {
        return new Metrics(fileRowCount, null, null, null, null, null, null);
    }
    // Columns that are descendants of LIST or MAP types are excluded because:
    // 1. Their stats are not used by Apache Iceberg to filter out data files
    // 2. Their record count can be larger than table-level row count. There's no good way to calculate nullCounts for them.
    // See https://github.com/apache/iceberg/pull/199#discussion_r429443627
    Set<OrcColumnId> excludedColumns = getExcludedColumns(orcColumns);
    ImmutableMap.Builder<Integer, Long> valueCountsBuilder = ImmutableMap.builder();
    ImmutableMap.Builder<Integer, Long> nullCountsBuilder = ImmutableMap.builder();
    ImmutableMap.Builder<Integer, ByteBuffer> lowerBoundsBuilder = ImmutableMap.builder();
    ImmutableMap.Builder<Integer, ByteBuffer> upperBoundsBuilder = ImmutableMap.builder();
    // OrcColumnId(0) is the root column that represents file-level schema
    for (int i = 1; i < orcColumns.size(); i++) {
        OrcColumnId orcColumnId = new OrcColumnId(i);
        if (excludedColumns.contains(orcColumnId)) {
            continue;
        }
        OrcType orcColumn = orcColumns.get(orcColumnId);
        ColumnStatistics orcColumnStats = columnStatistics.get().get(orcColumnId);
        int icebergId = getIcebergId(orcColumn);
        Types.NestedField icebergField = icebergSchema.findField(icebergId);
        MetricsModes.MetricsMode metricsMode = MetricsUtil.metricsMode(icebergSchema, metricsConfig, icebergId);
        if (metricsMode.equals(MetricsModes.None.get())) {
            continue;
        }
        verify(icebergField != null, "Cannot find Iceberg column with ID %s in schema %s", icebergId, icebergSchema);
        valueCountsBuilder.put(icebergId, fileRowCount);
        if (orcColumnStats.hasNumberOfValues()) {
            nullCountsBuilder.put(icebergId, fileRowCount - orcColumnStats.getNumberOfValues());
        }
        if (!metricsMode.equals(MetricsModes.Counts.get())) {
            toIcebergMinMax(orcColumnStats, icebergField.type(), metricsMode).ifPresent(minMax -> {
                lowerBoundsBuilder.put(icebergId, minMax.getMin());
                upperBoundsBuilder.put(icebergId, minMax.getMax());
            });
        }
    }
    Map<Integer, Long> valueCounts = valueCountsBuilder.buildOrThrow();
    Map<Integer, Long> nullCounts = nullCountsBuilder.buildOrThrow();
    Map<Integer, ByteBuffer> lowerBounds = lowerBoundsBuilder.buildOrThrow();
    Map<Integer, ByteBuffer> upperBounds = upperBoundsBuilder.buildOrThrow();
    return new Metrics(
            fileRowCount,
            null, // TODO: Add column size accounting to ORC column writers
            valueCounts.isEmpty() ? null : valueCounts,
            nullCounts.isEmpty() ? null : nullCounts,
            null, // TODO: Add nanValueCounts to ORC writer
            lowerBounds.isEmpty() ? null : lowerBounds,
            upperBounds.isEmpty() ? null : upperBounds);
}
Also used: ColumnStatistics (io.trino.orc.metadata.statistics.ColumnStatistics), OrcColumnId (io.trino.orc.metadata.OrcColumnId), Types (org.apache.iceberg.types.Types), ByteBuffer (java.nio.ByteBuffer), ImmutableMap (com.google.common.collect.ImmutableMap), MetricsModes (org.apache.iceberg.MetricsModes), Metrics (org.apache.iceberg.Metrics), OrcType (io.trino.orc.metadata.OrcType)
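Iceberg's Metrics wants null counts, but ORC column statistics record numberOfValues, the non-null count, so the null count is derived by subtraction against the file row count. With made-up numbers:

long fileRowCount = 1_000_000L;                 // rows written to the file
long numberOfValues = 998_500L;                 // non-null values per ORC column statistics
long nullCount = fileRowCount - numberOfValues; // 1_500 nulls reported to Iceberg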

Example 15 with OrcColumnId

Use of io.trino.orc.metadata.OrcColumnId in project trino by trinodb.

From class TestReadBloomFilter, method testType:

private static <T> void testType(Type type, List<T> uniqueValues, T inBloomFilter, T notInBloomFilter) throws Exception {
    Stream<T> writeValues = newArrayList(limit(cycle(uniqueValues), 30_000)).stream();
    try (TempFile tempFile = new TempFile()) {
        writeOrcColumnHive(tempFile.getFile(), ORC_12, LZ4, type, writeValues.iterator());
        // without predicate a normal block will be created
        try (OrcRecordReader recordReader = createCustomOrcRecordReader(tempFile, OrcPredicate.TRUE, type, MAX_BATCH_SIZE)) {
            assertEquals(recordReader.nextPage().getLoadedPage().getPositionCount(), 8196);
        }
        // predicate for specific value within the min/max range without bloom filter being enabled
        TupleDomainOrcPredicate noBloomFilterPredicate = TupleDomainOrcPredicate.builder().addColumn(new OrcColumnId(1), Domain.singleValue(type, notInBloomFilter)).build();
        try (OrcRecordReader recordReader = createCustomOrcRecordReader(tempFile, noBloomFilterPredicate, type, MAX_BATCH_SIZE)) {
            assertEquals(recordReader.nextPage().getLoadedPage().getPositionCount(), 8196);
        }
        // predicate for specific value within the min/max range with bloom filter enabled, but a value not in the bloom filter
        TupleDomainOrcPredicate notMatchBloomFilterPredicate = TupleDomainOrcPredicate.builder().addColumn(new OrcColumnId(1), Domain.singleValue(type, notInBloomFilter)).setBloomFiltersEnabled(true).build();
        try (OrcRecordReader recordReader = createCustomOrcRecordReader(tempFile, notMatchBloomFilterPredicate, type, MAX_BATCH_SIZE)) {
            assertNull(recordReader.nextPage());
        }
        // predicate for specific value within the min/max range with bloom filter enabled, and a value in the bloom filter
        TupleDomainOrcPredicate matchBloomFilterPredicate = TupleDomainOrcPredicate.builder().addColumn(new OrcColumnId(1), Domain.singleValue(type, inBloomFilter)).setBloomFiltersEnabled(true).build();
        try (OrcRecordReader recordReader = createCustomOrcRecordReader(tempFile, matchBloomFilterPredicate, type, MAX_BATCH_SIZE)) {
            assertEquals(recordReader.nextPage().getLoadedPage().getPositionCount(), 8196);
        }
    }
}
Also used: OrcColumnId (io.trino.orc.metadata.OrcColumnId), SMALLINT (io.trino.spi.type.SmallintType.SMALLINT), BIGINT (io.trino.spi.type.BigintType.BIGINT), TINYINT (io.trino.spi.type.TinyintType.TINYINT)
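The three predicates walk the pruning decision end to end: a probe value inside the column's min/max range survives statistics-based filtering, so only an enabled bloom filter can prove it absent and make nextPage() return null. A minimal sketch of that decision, with hypothetical names rather than the Trino predicate internals:

import java.util.function.LongPredicate;

// Hypothetical simplification of the row-group pruning decision exercised above.
static boolean shouldReadRowGroup(long min, long max, long probe, boolean bloomFiltersEnabled, LongPredicate bloomMightContain) {
    if (probe < min || probe > max) {
        return false; // min/max statistics alone rule the value out
    }
    if (bloomFiltersEnabled && !bloomMightContain.test(probe)) {
        return false; // bloom filter proves the value is absent
    }
    return true; // the value might be present; read the row group
}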

Aggregations

OrcColumnId (io.trino.orc.metadata.OrcColumnId): 24
ImmutableMap (com.google.common.collect.ImmutableMap): 10
Stream (io.trino.orc.metadata.Stream): 9
ColumnStatistics (io.trino.orc.metadata.statistics.ColumnStatistics): 9
ArrayList (java.util.ArrayList): 9
OrcType (io.trino.orc.metadata.OrcType): 8
List (java.util.List): 8
ImmutableList (com.google.common.collect.ImmutableList): 7
Slice (io.airlift.slice.Slice): 5
CompressionKind (io.trino.orc.metadata.CompressionKind): 5
Map (java.util.Map): 5
ImmutableList.toImmutableList (com.google.common.collect.ImmutableList.toImmutableList): 4
Footer (io.trino.orc.metadata.Footer): 4
OrcInputStream (io.trino.orc.stream.OrcInputStream): 4
Page (io.trino.spi.Page): 4
IOException (java.io.IOException): 4
InputStream (java.io.InputStream): 4
ByteBuffer (java.nio.ByteBuffer): 3
Configuration (org.apache.hadoop.conf.Configuration): 3
Path (org.apache.hadoop.fs.Path): 3