
Example 6 with OrcColumnId

Use of io.trino.orc.metadata.OrcColumnId in the trino project by trinodb.

From the class TestOrcReaderPositions, method testStripeSkipping.

@Test
public void testStripeSkipping() throws Exception {
    try (TempFile tempFile = new TempFile()) {
        createMultiStripeFile(tempFile.getFile());
        // test reading second and fourth stripes
        OrcPredicate predicate = (numberOfRows, allColumnStatistics) -> {
            if (numberOfRows == 100) {
                return true;
            }
            IntegerStatistics stats = allColumnStatistics.get(new OrcColumnId(1)).getIntegerStatistics();
            return ((stats.getMin() == 60) && (stats.getMax() == 117)) || ((stats.getMin() == 180) && (stats.getMax() == 237));
        };
        try (OrcRecordReader reader = createCustomOrcRecordReader(tempFile, predicate, BIGINT, MAX_BATCH_SIZE)) {
            assertEquals(reader.getFileRowCount(), 100);
            assertEquals(reader.getReaderRowCount(), 40);
            assertEquals(reader.getFilePosition(), 0);
            assertEquals(reader.getReaderPosition(), 0);
            // second stripe
            Page page = reader.nextPage().getLoadedPage();
            assertEquals(page.getPositionCount(), 20);
            assertEquals(reader.getReaderPosition(), 0);
            assertEquals(reader.getFilePosition(), 20);
            assertCurrentBatch(page, 1);
            // fourth stripe
            page = reader.nextPage().getLoadedPage();
            assertEquals(page.getPositionCount(), 20);
            assertEquals(reader.getReaderPosition(), 20);
            assertEquals(reader.getFilePosition(), 60);
            assertCurrentBatch(page, 3);
            page = reader.nextPage();
            assertNull(page);
            assertEquals(reader.getReaderPosition(), 40);
            assertEquals(reader.getFilePosition(), 100);
        }
    }
}
Also used : OrcFile(org.apache.hadoop.hive.ql.io.orc.OrcFile) Slice(io.airlift.slice.Slice) Assert.assertNull(org.testng.Assert.assertNull) IntegerStatistics(io.trino.orc.metadata.statistics.IntegerStatistics) Page(io.trino.spi.Page) Assert.assertEquals(org.testng.Assert.assertEquals) Writable(org.apache.hadoop.io.Writable) Test(org.testng.annotations.Test) OrcSerde(org.apache.hadoop.hive.ql.io.orc.OrcSerde) ByteBuffer(java.nio.ByteBuffer) Writer(org.apache.hadoop.hive.ql.io.orc.Writer) BATCH_SIZE_GROWTH_FACTOR(io.trino.orc.OrcReader.BATCH_SIZE_GROWTH_FACTOR) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) VARCHAR(io.trino.spi.type.VarcharType.VARCHAR) Block(io.trino.spi.block.Block) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) READER_OPTIONS(io.trino.orc.OrcTester.READER_OPTIONS) ORC_12(io.trino.orc.OrcTester.Format.ORC_12) NullMemoryManager(org.apache.orc.NullMemoryManager) ImmutableMap(com.google.common.collect.ImmutableMap) UTF_8(java.nio.charset.StandardCharsets.UTF_8) SNAPPY(org.apache.hadoop.hive.ql.io.orc.CompressionKind.SNAPPY) INITIAL_BATCH_SIZE(io.trino.orc.OrcReader.INITIAL_BATCH_SIZE) Assert.fail(org.testng.Assert.fail) IOException(java.io.IOException) OrcTester.createCustomOrcRecordReader(io.trino.orc.OrcTester.createCustomOrcRecordReader) OrcTester.createSettableStructObjectInspector(io.trino.orc.OrcTester.createSettableStructObjectInspector) Math.min(java.lang.Math.min) Field(java.lang.reflect.Field) Maps(com.google.common.collect.Maps) CompressionKind(io.trino.orc.metadata.CompressionKind) File(java.io.File) Footer(io.trino.orc.metadata.Footer) SettableStructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.SettableStructObjectInspector) MAX_BATCH_SIZE(io.trino.orc.OrcReader.MAX_BATCH_SIZE) BIGINT(io.trino.spi.type.BigintType.BIGINT) Serializer(org.apache.hadoop.hive.serde2.Serializer) OrcTester.createOrcRecordWriter(io.trino.orc.OrcTester.createOrcRecordWriter) Assert.assertTrue(org.testng.Assert.assertTrue) OrcOutputFormat(org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat) SerDeException(org.apache.hadoop.hive.serde2.SerDeException) OrcColumnId(io.trino.orc.metadata.OrcColumnId) StructField(org.apache.hadoop.hive.serde2.objectinspector.StructField) OrcColumnId(io.trino.orc.metadata.OrcColumnId) Page(io.trino.spi.Page) OrcTester.createCustomOrcRecordReader(io.trino.orc.OrcTester.createCustomOrcRecordReader) IntegerStatistics(io.trino.orc.metadata.statistics.IntegerStatistics) Test(org.testng.annotations.Test)
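
The predicate above hard-codes the expected min/max values of the second and fourth stripes. As a rough, generalized sketch of the same idea (not part of the Trino test; the class and the overlappingRange helper are invented for illustration): keep any stripe or row group whose column-1 integer statistics overlap a target range, and fall back to reading when the statistics are missing.

import io.trino.orc.OrcPredicate;
import io.trino.orc.metadata.OrcColumnId;
import io.trino.orc.metadata.statistics.IntegerStatistics;

public final class PredicateSketch {

    private PredicateSketch() {}

    // Keep a stripe or row group only if column 1's [min, max] overlaps [low, high].
    // Missing statistics cannot prove exclusion, so such units are still read.
    public static OrcPredicate overlappingRange(long low, long high) {
        return (numberOfRows, allColumnStatistics) -> {
            IntegerStatistics stats = allColumnStatistics.get(new OrcColumnId(1)).getIntegerStatistics();
            if (stats == null || stats.getMin() == null || stats.getMax() == null) {
                return true;
            }
            return stats.getMax() >= low && stats.getMin() <= high;
        };
    }
}

The test's predicate also short-circuits on numberOfRows == 100 so the file-level check always passes; this sketch omits that shortcut.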

Example 7 with OrcColumnId

Use of io.trino.orc.metadata.OrcColumnId in the trino project by trinodb.

From the class StructColumnWriter, method finishRowGroup.

@Override
public Map<OrcColumnId, ColumnStatistics> finishRowGroup() {
    checkState(!closed);
    ColumnStatistics statistics = new ColumnStatistics((long) nonNullValueCount, 0, null, null, null, null, null, null, null, null, null);
    rowGroupColumnStatistics.add(statistics);
    nonNullValueCount = 0;
    ImmutableMap.Builder<OrcColumnId, ColumnStatistics> columnStatistics = ImmutableMap.builder();
    columnStatistics.put(columnId, statistics);
    structFields.stream().map(ColumnWriter::finishRowGroup).forEach(columnStatistics::putAll);
    return columnStatistics.buildOrThrow();
}
Also used : ColumnStatistics(io.trino.orc.metadata.statistics.ColumnStatistics) OrcColumnId(io.trino.orc.metadata.OrcColumnId) ImmutableMap(com.google.common.collect.ImmutableMap)
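
A hedged sketch of the composition pattern used above, pulled out of the writer for illustration (the class and method names here are invented): a struct writer's row-group map is its own entry plus everything its child writers report, and buildOrThrow() fails fast if two writers ever claim the same column id.

import com.google.common.collect.ImmutableMap;
import io.trino.orc.metadata.OrcColumnId;
import io.trino.orc.metadata.statistics.ColumnStatistics;

import java.util.List;
import java.util.Map;

final class RowGroupStatisticsSketch {

    private RowGroupStatisticsSketch() {}

    // Combine the struct column's own statistics with the maps produced by its child writers.
    static Map<OrcColumnId, ColumnStatistics> combine(
            OrcColumnId structColumnId,
            ColumnStatistics structStatistics,
            List<Map<OrcColumnId, ColumnStatistics>> childStatistics) {
        ImmutableMap.Builder<OrcColumnId, ColumnStatistics> builder = ImmutableMap.builder();
        builder.put(structColumnId, structStatistics);
        childStatistics.forEach(builder::putAll);
        // buildOrThrow() throws if any column id was added twice, which would indicate a writer bug
        return builder.buildOrThrow();
    }
}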

Example 8 with OrcColumnId

Use of io.trino.orc.metadata.OrcColumnId in the trino project by trinodb.

From the class TestBooleanStream, method testWriteMultiple.

@Test
public void testWriteMultiple() throws IOException {
    BooleanOutputStream outputStream = createValueOutputStream();
    for (int i = 0; i < 3; i++) {
        outputStream.reset();
        BooleanList expectedValues = new BooleanArrayList(1024);
        outputStream.writeBooleans(32, true);
        expectedValues.addAll(Collections.nCopies(32, true));
        outputStream.writeBooleans(32, false);
        expectedValues.addAll(Collections.nCopies(32, false));
        outputStream.writeBooleans(1, true);
        expectedValues.add(true);
        outputStream.writeBooleans(1, false);
        expectedValues.add(false);
        outputStream.writeBooleans(34, true);
        expectedValues.addAll(Collections.nCopies(34, true));
        outputStream.writeBooleans(34, false);
        expectedValues.addAll(Collections.nCopies(34, false));
        outputStream.writeBoolean(true);
        expectedValues.add(true);
        outputStream.writeBoolean(false);
        expectedValues.add(false);
        outputStream.close();
        DynamicSliceOutput sliceOutput = new DynamicSliceOutput(1000);
        StreamDataOutput streamDataOutput = outputStream.getStreamDataOutput(new OrcColumnId(33));
        streamDataOutput.writeData(sliceOutput);
        Stream stream = streamDataOutput.getStream();
        assertEquals(stream.getStreamKind(), StreamKind.DATA);
        assertEquals(stream.getColumnId(), new OrcColumnId(33));
        assertEquals(stream.getLength(), sliceOutput.size());
        BooleanInputStream valueStream = createValueStream(sliceOutput.slice());
        for (int index = 0; index < expectedValues.size(); index++) {
            boolean expectedValue = expectedValues.getBoolean(index);
            boolean actualValue = readValue(valueStream);
            assertEquals(actualValue, expectedValue);
        }
    }
}
Also used : BooleanList(it.unimi.dsi.fastutil.booleans.BooleanList) OrcColumnId(io.trino.orc.metadata.OrcColumnId) BooleanArrayList(it.unimi.dsi.fastutil.booleans.BooleanArrayList) DynamicSliceOutput(io.airlift.slice.DynamicSliceOutput) Stream(io.trino.orc.metadata.Stream) BooleanStreamCheckpoint(io.trino.orc.checkpoint.BooleanStreamCheckpoint) Test(org.testng.annotations.Test)
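
The assertEquals(stream.getColumnId(), new OrcColumnId(33)) call only passes because OrcColumnId compares by its numeric id rather than by identity; the same property is what lets it serve as a map key in the other examples. A tiny sketch of that assumption (the map and helper are illustrative, not Trino code):

import io.trino.orc.metadata.OrcColumnId;
import io.trino.orc.metadata.Stream;

import java.util.Map;

final class ColumnIdLookupSketch {

    private ColumnIdLookupSketch() {}

    // A freshly constructed OrcColumnId(id) finds an entry stored under an equal id,
    // because OrcColumnId has value-based equals/hashCode.
    static Stream lookup(Map<OrcColumnId, Stream> streamsByColumn, int id) {
        return streamsByColumn.get(new OrcColumnId(id));
    }
}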

Example 9 with OrcColumnId

Use of io.trino.orc.metadata.OrcColumnId in the trino project by trinodb.

From the class AbstractTestValueStream, method testWriteValue.

protected void testWriteValue(List<List<T>> groups) throws IOException {
    W outputStream = createValueOutputStream();
    for (int i = 0; i < 3; i++) {
        outputStream.reset();
        long retainedBytes = 0;
        for (List<T> group : groups) {
            outputStream.recordCheckpoint();
            group.forEach(value -> writeValue(outputStream, value));
            assertTrue(outputStream.getRetainedBytes() >= retainedBytes);
            retainedBytes = outputStream.getRetainedBytes();
        }
        outputStream.close();
        DynamicSliceOutput sliceOutput = new DynamicSliceOutput(1000);
        StreamDataOutput streamDataOutput = outputStream.getStreamDataOutput(new OrcColumnId(33));
        streamDataOutput.writeData(sliceOutput);
        Stream stream = streamDataOutput.getStream();
        assertEquals(stream.getStreamKind(), StreamKind.DATA);
        assertEquals(stream.getColumnId(), new OrcColumnId(33));
        assertEquals(stream.getLength(), sliceOutput.size());
        List<C> checkpoints = outputStream.getCheckpoints();
        assertEquals(checkpoints.size(), groups.size());
        R valueStream = createValueStream(sliceOutput.slice());
        for (List<T> group : groups) {
            int index = 0;
            for (T expectedValue : group) {
                index++;
                T actualValue = readValue(valueStream);
                if (!actualValue.equals(expectedValue)) {
                    assertEquals(actualValue, expectedValue, "index=" + index);
                }
            }
        }
        for (int groupIndex = groups.size() - 1; groupIndex >= 0; groupIndex--) {
            valueStream.seekToCheckpoint(checkpoints.get(groupIndex));
            for (T expectedValue : groups.get(groupIndex)) {
                T actualValue = readValue(valueStream);
                if (!actualValue.equals(expectedValue)) {
                    assertEquals(actualValue, expectedValue);
                }
            }
        }
    }
}
Also used : OrcColumnId(io.trino.orc.metadata.OrcColumnId) DynamicSliceOutput(io.airlift.slice.DynamicSliceOutput) Stream(io.trino.orc.metadata.Stream) StreamCheckpoint(io.trino.orc.checkpoint.StreamCheckpoint)
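
Both passes above check the same contract: after seeking to the checkpoint recorded at the start of a group, reading group.size() values reproduces the group in order, no matter what was read before the seek. A framework-free sketch of that check, with the reader abstracted behind a Supplier so no Trino stream types are needed (the helper class is illustrative):

import java.util.List;
import java.util.function.Supplier;

import static org.testng.Assert.assertEquals;

final class CheckpointContractSketch {

    private CheckpointContractSketch() {}

    // Read expectedGroup.size() values from the stream and compare them in order.
    static <T> void assertGroupRoundTrip(List<T> expectedGroup, Supplier<T> readNextValue) {
        for (int i = 0; i < expectedGroup.size(); i++) {
            assertEquals(readNextValue.get(), expectedGroup.get(i), "index=" + i);
        }
    }
}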

Example 10 with OrcColumnId

Use of io.trino.orc.metadata.OrcColumnId in the trino project by trinodb.

From the class StripeReader, method readStripe.

public Stripe readStripe(StripeInformation stripe, AggregatedMemoryContext memoryUsage) throws IOException {
    // read the stripe footer
    StripeFooter stripeFooter = readStripeFooter(stripe, memoryUsage);
    ColumnMetadata<ColumnEncoding> columnEncodings = stripeFooter.getColumnEncodings();
    if (writeValidation.isPresent()) {
        writeValidation.get().validateTimeZone(orcDataSource.getId(), stripeFooter.getTimeZone());
    }
    ZoneId fileTimeZone = stripeFooter.getTimeZone();
    // get streams for selected columns
    Map<StreamId, Stream> streams = new HashMap<>();
    for (Stream stream : stripeFooter.getStreams()) {
        if (includedOrcColumnIds.contains(stream.getColumnId()) && isSupportedStreamType(stream, types.get(stream.getColumnId()).getOrcTypeKind())) {
            streams.put(new StreamId(stream), stream);
        }
    }
    // handle stripes with more than one row group
    boolean invalidCheckPoint = false;
    if (rowsInRowGroup.isPresent() && stripe.getNumberOfRows() > rowsInRowGroup.getAsInt()) {
        // determine ranges of the stripe to read
        Map<StreamId, DiskRange> diskRanges = getDiskRanges(stripeFooter.getStreams());
        diskRanges = Maps.filterKeys(diskRanges, Predicates.in(streams.keySet()));
        // read the file regions
        Map<StreamId, OrcChunkLoader> streamsData = readDiskRanges(stripe.getOffset(), diskRanges, memoryUsage);
        // read the bloom filter for each column
        Map<OrcColumnId, List<BloomFilter>> bloomFilterIndexes = readBloomFilterIndexes(streams, streamsData);
        // read the row index for each column
        Map<StreamId, List<RowGroupIndex>> columnIndexes = readColumnIndexes(streams, streamsData, bloomFilterIndexes);
        if (writeValidation.isPresent()) {
            writeValidation.get().validateRowGroupStatistics(orcDataSource.getId(), stripe.getOffset(), columnIndexes);
        }
        // select the row groups matching the tuple domain
        Set<Integer> selectedRowGroups = selectRowGroups(stripe, columnIndexes);
        // if all row groups are skipped, return null
        if (selectedRowGroups.isEmpty()) {
            // set accounted memory usage to zero
            memoryUsage.close();
            return null;
        }
        // value streams
        Map<StreamId, ValueInputStream<?>> valueStreams = createValueStreams(streams, streamsData, columnEncodings);
        // build the dictionary streams
        InputStreamSources dictionaryStreamSources = createDictionaryStreamSources(streams, valueStreams, columnEncodings);
        // build the row groups
        try {
            List<RowGroup> rowGroups = createRowGroups(stripe.getNumberOfRows(), streams, valueStreams, columnIndexes, selectedRowGroups, columnEncodings);
            return new Stripe(stripe.getNumberOfRows(), fileTimeZone, columnEncodings, rowGroups, dictionaryStreamSources);
        } catch (InvalidCheckpointException e) {
            // The ORC file contains a corrupt checkpoint stream; treat the stripe as a single row group.
            invalidCheckPoint = true;
        }
    }
    // stripe only has one row group
    ImmutableMap.Builder<StreamId, DiskRange> diskRangesBuilder = ImmutableMap.builder();
    for (Entry<StreamId, DiskRange> entry : getDiskRanges(stripeFooter.getStreams()).entrySet()) {
        StreamId streamId = entry.getKey();
        if (streams.containsKey(streamId)) {
            diskRangesBuilder.put(entry);
        }
    }
    ImmutableMap<StreamId, DiskRange> diskRanges = diskRangesBuilder.buildOrThrow();
    // read the file regions
    Map<StreamId, OrcChunkLoader> streamsData = readDiskRanges(stripe.getOffset(), diskRanges, memoryUsage);
    long minAverageRowBytes = 0;
    for (Entry<StreamId, Stream> entry : streams.entrySet()) {
        if (entry.getKey().getStreamKind() == ROW_INDEX) {
            List<RowGroupIndex> rowGroupIndexes = metadataReader.readRowIndexes(hiveWriterVersion, new OrcInputStream(streamsData.get(entry.getKey())));
            checkState(rowGroupIndexes.size() == 1 || invalidCheckPoint, "expect a single row group or an invalid checkpoint");
            long totalBytes = 0;
            long totalRows = 0;
            for (RowGroupIndex rowGroupIndex : rowGroupIndexes) {
                ColumnStatistics columnStatistics = rowGroupIndex.getColumnStatistics();
                if (columnStatistics.hasMinAverageValueSizeInBytes()) {
                    totalBytes += columnStatistics.getMinAverageValueSizeInBytes() * columnStatistics.getNumberOfValues();
                    totalRows += columnStatistics.getNumberOfValues();
                }
            }
            if (totalRows > 0) {
                minAverageRowBytes += totalBytes / totalRows;
            }
        }
    }
    // value streams
    Map<StreamId, ValueInputStream<?>> valueStreams = createValueStreams(streams, streamsData, columnEncodings);
    // build the dictionary streams
    InputStreamSources dictionaryStreamSources = createDictionaryStreamSources(streams, valueStreams, columnEncodings);
    // build the row group
    ImmutableMap.Builder<StreamId, InputStreamSource<?>> builder = ImmutableMap.builder();
    for (Entry<StreamId, ValueInputStream<?>> entry : valueStreams.entrySet()) {
        builder.put(entry.getKey(), new ValueInputStreamSource<>(entry.getValue()));
    }
    RowGroup rowGroup = new RowGroup(0, 0, stripe.getNumberOfRows(), minAverageRowBytes, new InputStreamSources(builder.buildOrThrow()));
    return new Stripe(stripe.getNumberOfRows(), fileTimeZone, columnEncodings, ImmutableList.of(rowGroup), dictionaryStreamSources);
}
Also used : ValueInputStream(io.trino.orc.stream.ValueInputStream) OrcColumnId(io.trino.orc.metadata.OrcColumnId) HashMap(java.util.HashMap) InvalidCheckpointException(io.trino.orc.checkpoint.InvalidCheckpointException) ValueInputStreamSource(io.trino.orc.stream.ValueInputStreamSource) InputStreamSource(io.trino.orc.stream.InputStreamSource) Stream(io.trino.orc.metadata.Stream) OrcInputStream(io.trino.orc.stream.OrcInputStream) ValueInputStream(io.trino.orc.stream.ValueInputStream) InputStream(java.io.InputStream) List(java.util.List) ArrayList(java.util.ArrayList) ImmutableList(com.google.common.collect.ImmutableList) ColumnStatistics(io.trino.orc.metadata.statistics.ColumnStatistics) OrcInputStream(io.trino.orc.stream.OrcInputStream) ZoneId(java.time.ZoneId) OrcChunkLoader(io.trino.orc.stream.OrcChunkLoader) ImmutableMap(com.google.common.collect.ImmutableMap) ImmutableMap.toImmutableMap(com.google.common.collect.ImmutableMap.toImmutableMap) ColumnEncoding(io.trino.orc.metadata.ColumnEncoding) InputStreamSources(io.trino.orc.stream.InputStreamSources) StripeFooter(io.trino.orc.metadata.StripeFooter) RowGroupIndex(io.trino.orc.metadata.RowGroupIndex)
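
Note the null return when selectedRowGroups is empty: callers have to treat it as "skip this stripe entirely". A minimal caller-side sketch under that assumption (StripeReader is an internal class, so the sketch pretends to live in the io.trino.orc package; the surrounding loop and the processStripe hook are hypothetical):

package io.trino.orc;

import java.io.IOException;
import java.util.List;

import io.trino.memory.context.AggregatedMemoryContext;
import io.trino.orc.metadata.StripeInformation;

final class StripeIterationSketch {

    private StripeIterationSketch() {}

    static void readAllStripes(StripeReader stripeReader, List<StripeInformation> stripes, AggregatedMemoryContext memoryUsage)
            throws IOException {
        for (StripeInformation stripeInformation : stripes) {
            Stripe stripe = stripeReader.readStripe(stripeInformation, memoryUsage);
            if (stripe == null) {
                // every row group in this stripe was pruned by the predicate
                continue;
            }
            processStripe(stripe);
        }
    }

    private static void processStripe(Stripe stripe) {
        // hypothetical hook: hand the stripe's row groups and dictionary streams to the column readers
    }
}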

Aggregations

OrcColumnId (io.trino.orc.metadata.OrcColumnId): 24
ImmutableMap (com.google.common.collect.ImmutableMap): 10
Stream (io.trino.orc.metadata.Stream): 9
ColumnStatistics (io.trino.orc.metadata.statistics.ColumnStatistics): 9
ArrayList (java.util.ArrayList): 9
OrcType (io.trino.orc.metadata.OrcType): 8
List (java.util.List): 8
ImmutableList (com.google.common.collect.ImmutableList): 7
Slice (io.airlift.slice.Slice): 5
CompressionKind (io.trino.orc.metadata.CompressionKind): 5
Map (java.util.Map): 5
ImmutableList.toImmutableList (com.google.common.collect.ImmutableList.toImmutableList): 4
Footer (io.trino.orc.metadata.Footer): 4
OrcInputStream (io.trino.orc.stream.OrcInputStream): 4
Page (io.trino.spi.Page): 4
IOException (java.io.IOException): 4
InputStream (java.io.InputStream): 4
ByteBuffer (java.nio.ByteBuffer): 3
Configuration (org.apache.hadoop.conf.Configuration): 3
Path (org.apache.hadoop.fs.Path): 3