Search in sources :

Example 6 with InputStreamSources

use of io.prestosql.orc.stream.InputStreamSources in project hetu-core by openlookeng.

the class StripeReader method createDictionaryStreamSources.

/**
 * Collects an input stream source for every dictionary stream that has a value stream.
 *
 * <p>Streams that are not dictionary streams for their column encoding, and streams
 * with no entry in {@code valueStreams}, are left out of the result.
 *
 * @param streams the streams to consider, keyed by stream id
 * @param valueStreams the value streams available for those ids
 * @param columnEncodings per-column encodings used to decide whether a stream is a dictionary stream
 * @return the dictionary stream sources keyed by stream id
 */
private InputStreamSources createDictionaryStreamSources(Map<StreamId, Stream> streams, Map<StreamId, ValueInputStream<?>> valueStreams, ColumnMetadata<ColumnEncoding> columnEncodings) {
    ImmutableMap.Builder<StreamId, InputStreamSource<?>> sources = ImmutableMap.builder();
    for (Entry<StreamId, Stream> candidate : streams.entrySet()) {
        StreamId id = candidate.getKey();
        Stream current = candidate.getValue();
        OrcColumnId columnId = current.getColumnId();
        ColumnEncodingKind encodingKind = columnEncodings.get(columnId).getColumnEncodingKind();
        // only dictionary streams are of interest here
        if (isDictionary(current, encodingKind)) {
            ValueInputStream<?> dictionaryValues = valueStreams.get(id);
            // a missing value stream means the stream carries no data
            if (dictionaryValues != null) {
                OrcTypeKind typeKind = types.get(current.getColumnId()).getOrcTypeKind();
                StreamCheckpoint checkpoint = getDictionaryStreamCheckpoint(id, typeKind, encodingKind);
                sources.put(id, createCheckpointStreamSource(dictionaryValues, checkpoint));
            }
        }
    }
    return new InputStreamSources(sources.build());
}
Also used : OrcColumnId(io.prestosql.orc.metadata.OrcColumnId) OrcTypeKind(io.prestosql.orc.metadata.OrcType.OrcTypeKind) ImmutableMap(com.google.common.collect.ImmutableMap) ImmutableMap.toImmutableMap(com.google.common.collect.ImmutableMap.toImmutableMap) ValueInputStreamSource(io.prestosql.orc.stream.ValueInputStreamSource) InputStreamSource(io.prestosql.orc.stream.InputStreamSource) InputStreamSources(io.prestosql.orc.stream.InputStreamSources) OrcInputStream(io.prestosql.orc.stream.OrcInputStream) ValueInputStream(io.prestosql.orc.stream.ValueInputStream) Stream(io.prestosql.orc.metadata.Stream) InputStream(java.io.InputStream) ColumnEncodingKind(io.prestosql.orc.metadata.ColumnEncoding.ColumnEncodingKind) Checkpoints.getDictionaryStreamCheckpoint(io.prestosql.orc.checkpoint.Checkpoints.getDictionaryStreamCheckpoint) StreamCheckpoint(io.prestosql.orc.checkpoint.StreamCheckpoint)

Example 7 with InputStreamSources

use of io.prestosql.orc.stream.InputStreamSources in project hetu-core by openlookeng.

the class StripeReader method createRowGroup.

/**
 * Builds a single row group from the checkpoints that have a matching value stream.
 *
 * <p>Checkpoints whose stream id has no entry in {@code valueStreams} are skipped.
 *
 * @param groupId id passed through to the {@link RowGroup}
 * @param rowOffset row offset passed through to the {@link RowGroup}
 * @param rowCount row count passed through to the {@link RowGroup}
 * @param minAverageRowBytes minimum average row size passed through to the {@link RowGroup}
 * @param valueStreams the value streams available, keyed by stream id
 * @param checkpoints the checkpoint for each stream in this row group
 * @return the row group wrapping one checkpointed stream source per usable stream
 */
private static RowGroup createRowGroup(int groupId, int rowOffset, int rowCount, long minAverageRowBytes, Map<StreamId, ValueInputStream<?>> valueStreams, Map<StreamId, StreamCheckpoint> checkpoints) {
    ImmutableMap.Builder<StreamId, InputStreamSource<?>> sources = ImmutableMap.builder();
    for (Entry<StreamId, StreamCheckpoint> checkpointEntry : checkpoints.entrySet()) {
        StreamId id = checkpointEntry.getKey();
        ValueInputStream<?> values = valueStreams.get(id);
        // streams without data have no value stream and contribute nothing
        if (values != null) {
            StreamCheckpoint position = checkpointEntry.getValue();
            sources.put(id, createCheckpointStreamSource(values, position));
        }
    }
    return new RowGroup(groupId, rowOffset, rowCount, minAverageRowBytes, new InputStreamSources(sources.build()));
}
Also used : ValueInputStreamSource(io.prestosql.orc.stream.ValueInputStreamSource) InputStreamSource(io.prestosql.orc.stream.InputStreamSource) InputStreamSources(io.prestosql.orc.stream.InputStreamSources) ImmutableMap(com.google.common.collect.ImmutableMap) ImmutableMap.toImmutableMap(com.google.common.collect.ImmutableMap.toImmutableMap) Checkpoints.getDictionaryStreamCheckpoint(io.prestosql.orc.checkpoint.Checkpoints.getDictionaryStreamCheckpoint) StreamCheckpoint(io.prestosql.orc.checkpoint.StreamCheckpoint)

Example 8 with InputStreamSources

use of io.prestosql.orc.stream.InputStreamSources in project hetu-core by openlookeng.

the class StripeReader method readStripe.

/**
 * Reads one stripe and returns its row groups together with its dictionary stream sources.
 *
 * <p>The stripe footer is read first (through the stripe footer cache when that cache is
 * enabled). When the stripe has more rows than {@code rowsInRowGroup}, the row indexes are
 * read, row groups are selected, and one {@link RowGroup} is created per selected group.
 * Otherwise, or when an {@link InvalidCheckpointException} is raised while creating the row
 * groups, the whole stripe is returned as a single row group.
 *
 * @param stripe the stripe to read
 * @param systemMemoryUsage memory context passed to the footer and disk range reads; it is
 *        closed when every row group of the stripe is skipped
 * @return the stripe, or {@code null} when no row group is selected
 * @throws IOException declared by this method; presumably raised by the underlying reads
 */
public Stripe readStripe(StripeInformation stripe, AggregatedMemoryContext systemMemoryUsage) throws IOException {
    // read the stripe footer
    // the cache key is made of the data source id, its last modified time and the stripe offset
    // NOTE(review): the key is built even when the stripe footer cache is disabled
    OrcStripeFooterCacheKey cacheKey = new OrcStripeFooterCacheKey();
    cacheKey.setOrcDataSourceId(new OrcDataSourceIdWithTimeStamp(orcDataSource.getId(), orcDataSource.getLastModifiedTime()));
    cacheKey.setStripeOffset(stripe.getOffset());
    StripeFooter stripeFooter;
    if (orcCacheProperties.isStripeFooterCacheEnabled()) {
        try {
            stripeFooter = orcCacheStore.getStripeFooterCache().get(cacheKey, () -> this.readStripeFooter(stripe, systemMemoryUsage));
        } catch (UncheckedExecutionException | ExecutionException executionException) {
            // if handleCacheLoadException returns normally, fall back to reading the footer without the cache
            handleCacheLoadException(executionException);
            log.debug(executionException.getCause(), "Error while caching ORC stripe footer. Falling back to default flow");
            stripeFooter = readStripeFooter(stripe, systemMemoryUsage);
        }
    } else {
        stripeFooter = readStripeFooter(stripe, systemMemoryUsage);
    }
    ColumnMetadata<ColumnEncoding> columnEncodings = stripeFooter.getColumnEncodings();
    if (writeValidation.isPresent()) {
        writeValidation.get().validateTimeZone(orcDataSource.getId(), stripeFooter.getTimeZone());
    }
    ZoneId fileTimeZone = stripeFooter.getTimeZone();
    // get streams for selected columns
    // (only streams of included columns whose stream type is supported for the column type are kept)
    Map<StreamId, Stream> streams = new HashMap<>();
    for (Stream stream : stripeFooter.getStreams()) {
        if (includedOrcColumnIds.contains(stream.getColumnId()) && isSupportedStreamType(stream, types.get(stream.getColumnId()).getOrcTypeKind())) {
            streams.put(new StreamId(stream), stream);
        }
    }
    // handle stripes with more than one row group
    // invalidCheckPoint records that the multi-row-group path was abandoned; it relaxes the checkState below
    boolean invalidCheckPoint = false;
    if (stripe.getNumberOfRows() > rowsInRowGroup) {
        // determine ranges of the stripe to read
        Map<StreamId, DiskRange> diskRanges = getDiskRanges(stripeFooter.getStreams());
        diskRanges = Maps.filterKeys(diskRanges, Predicates.in(streams.keySet()));
        // read the file regions
        Map<StreamId, OrcChunkLoader> streamsData = readDiskRanges(stripe.getOffset(), diskRanges, systemMemoryUsage);
        // read the bloom filter for each column
        Map<OrcColumnId, List<HashableBloomFilter>> bloomFilterIndexes = readBloomFilterIndexes(streams, streamsData, stripe);
        // read the row index for each column
        Map<StreamId, List<RowGroupIndex>> columnIndexes = readColumnIndexes(streams, streamsData, bloomFilterIndexes, stripe);
        if (writeValidation.isPresent()) {
            writeValidation.get().validateRowGroupStatistics(orcDataSource.getId(), stripe.getOffset(), columnIndexes);
        }
        // select the row groups matching the tuple domain
        Set<Integer> selectedRowGroups = selectRowGroups(stripe, columnIndexes);
        // if all row groups are skipped, return null
        if (selectedRowGroups.isEmpty()) {
            // set accounted memory usage to zero
            systemMemoryUsage.close();
            return null;
        }
        // value streams
        Map<StreamId, ValueInputStream<?>> valueStreams = createValueStreams(streams, streamsData, columnEncodings);
        // build the dictionary streams
        InputStreamSources dictionaryStreamSources = createDictionaryStreamSources(streams, valueStreams, columnEncodings);
        // build the row groups
        try {
            List<RowGroup> rowGroups = createRowGroups(stripe.getNumberOfRows(), streams, valueStreams, columnIndexes, selectedRowGroups, columnEncodings);
            return new Stripe(stripe.getNumberOfRows(), fileTimeZone, columnEncodings, rowGroups, dictionaryStreamSources);
        } catch (InvalidCheckpointException e) {
            // The ORC file contains a corrupt checkpoint stream; treat the stripe as a single row group.
            // Execution falls through to the single-row-group path below, which reads the disk ranges again.
            invalidCheckPoint = true;
        }
    }
    // stripe only has one row group (or the checkpoints were invalid)
    ImmutableMap.Builder<StreamId, DiskRange> diskRangesBuilder = ImmutableMap.builder();
    for (Entry<StreamId, DiskRange> entry : getDiskRanges(stripeFooter.getStreams()).entrySet()) {
        StreamId streamId = entry.getKey();
        if (streams.containsKey(streamId)) {
            diskRangesBuilder.put(entry);
        }
    }
    ImmutableMap<StreamId, DiskRange> diskRanges = diskRangesBuilder.build();
    // read the file regions
    Map<StreamId, OrcChunkLoader> streamsData = readDiskRanges(stripe.getOffset(), diskRanges, systemMemoryUsage);
    // sum, over all ROW_INDEX streams, of each stream's average value size taken from its row group statistics
    long minAverageRowBytes = 0;
    for (Entry<StreamId, Stream> entry : streams.entrySet()) {
        if (entry.getKey().getStreamKind() == ROW_INDEX) {
            List<RowGroupIndex> rowGroupIndexes;
            if (orcCacheProperties.isRowIndexCacheEnabled()) {
                // the row index cache key additionally carries the stream id
                OrcRowIndexCacheKey indexCacheKey = new OrcRowIndexCacheKey();
                indexCacheKey.setOrcDataSourceId(new OrcDataSourceIdWithTimeStamp(orcDataSource.getId(), orcDataSource.getLastModifiedTime()));
                indexCacheKey.setStripeOffset(stripe.getOffset());
                indexCacheKey.setStreamId(entry.getKey());
                try {
                    rowGroupIndexes = orcCacheStore.getRowIndexCache().get(indexCacheKey, () -> metadataReader.readRowIndexes(hiveWriterVersion, new OrcInputStream(streamsData.get(entry.getKey()))));
                } catch (UncheckedExecutionException | ExecutionException executionException) {
                    // if handleCacheLoadException returns normally, fall back to reading the row indexes without the cache
                    handleCacheLoadException(executionException);
                    log.debug(executionException.getCause(), "Error while caching row group indexes. Falling back to default flow");
                    rowGroupIndexes = metadataReader.readRowIndexes(hiveWriterVersion, new OrcInputStream(streamsData.get(entry.getKey())));
                }
            } else {
                rowGroupIndexes = metadataReader.readRowIndexes(hiveWriterVersion, new OrcInputStream(streamsData.get(entry.getKey())));
            }
            // more than one index entry is only accepted when we got here through the invalid checkpoint fallback
            checkState(rowGroupIndexes.size() == 1 || invalidCheckPoint, "expect a single row group or an invalid check point");
            long totalBytes = 0;
            long totalRows = 0;
            for (RowGroupIndex rowGroupIndex : rowGroupIndexes) {
                ColumnStatistics columnStatistics = rowGroupIndex.getColumnStatistics();
                if (columnStatistics.hasMinAverageValueSizeInBytes()) {
                    totalBytes += columnStatistics.getMinAverageValueSizeInBytes() * columnStatistics.getNumberOfValues();
                    totalRows += columnStatistics.getNumberOfValues();
                }
            }
            // guard against division by zero when no statistics reported a size
            if (totalRows > 0) {
                minAverageRowBytes += totalBytes / totalRows;
            }
        }
    }
    // value streams
    Map<StreamId, ValueInputStream<?>> valueStreams = createValueStreams(streams, streamsData, columnEncodings);
    // build the dictionary streams
    InputStreamSources dictionaryStreamSources = createDictionaryStreamSources(streams, valueStreams, columnEncodings);
    // build the row group
    // (every value stream is wrapped as-is; no checkpoints are applied on this path)
    ImmutableMap.Builder<StreamId, InputStreamSource<?>> builder = ImmutableMap.builder();
    for (Entry<StreamId, ValueInputStream<?>> entry : valueStreams.entrySet()) {
        builder.put(entry.getKey(), new ValueInputStreamSource<>(entry.getValue()));
    }
    RowGroup rowGroup = new RowGroup(0, 0, stripe.getNumberOfRows(), minAverageRowBytes, new InputStreamSources(builder.build()));
    return new Stripe(stripe.getNumberOfRows(), fileTimeZone, columnEncodings, ImmutableList.of(rowGroup), dictionaryStreamSources);
}
Also used : ValueInputStream(io.prestosql.orc.stream.ValueInputStream) OrcColumnId(io.prestosql.orc.metadata.OrcColumnId) HashMap(java.util.HashMap) InvalidCheckpointException(io.prestosql.orc.checkpoint.InvalidCheckpointException) ValueInputStreamSource(io.prestosql.orc.stream.ValueInputStreamSource) InputStreamSource(io.prestosql.orc.stream.InputStreamSource) OrcInputStream(io.prestosql.orc.stream.OrcInputStream) ValueInputStream(io.prestosql.orc.stream.ValueInputStream) Stream(io.prestosql.orc.metadata.Stream) InputStream(java.io.InputStream) List(java.util.List) ArrayList(java.util.ArrayList) ImmutableList(com.google.common.collect.ImmutableList) UncheckedExecutionException(com.google.common.util.concurrent.UncheckedExecutionException) ExecutionException(java.util.concurrent.ExecutionException) ColumnStatistics(io.prestosql.orc.metadata.statistics.ColumnStatistics) UncheckedExecutionException(com.google.common.util.concurrent.UncheckedExecutionException) OrcInputStream(io.prestosql.orc.stream.OrcInputStream) ZoneId(java.time.ZoneId) OrcChunkLoader(io.prestosql.orc.stream.OrcChunkLoader) ImmutableMap(com.google.common.collect.ImmutableMap) ImmutableMap.toImmutableMap(com.google.common.collect.ImmutableMap.toImmutableMap) ColumnEncoding(io.prestosql.orc.metadata.ColumnEncoding) InputStreamSources(io.prestosql.orc.stream.InputStreamSources) StripeFooter(io.prestosql.orc.metadata.StripeFooter) RowGroupIndex(io.prestosql.orc.metadata.RowGroupIndex)

Example 9 with InputStreamSources

use of io.prestosql.orc.stream.InputStreamSources in project hetu-core by openlookeng.

the class TestCachingColumnReader method testBlockCachedOnStartStripe.

/**
 * Checks that {@code CachingColumnReader.startStripe} forwards its arguments to the
 * wrapped {@link ColumnReader}.
 */
@Test
public void testBlockCachedOnStartStripe() throws IOException {
    // collaborators: the wrapped reader, a spied cache and the stream sources are all test doubles
    ColumnReader delegate = mock(ColumnReader.class);
    Cache<OrcRowDataCacheKey, Block> blockCache = spy(CacheBuilder.newBuilder().build());
    InputStreamSources dictionarySources = mock(InputStreamSources.class);
    Stripe mockedStripe = mock(Stripe.class);
    CachingColumnReader readerUnderTest = new CachingColumnReader(delegate, column, blockCache);

    // the time zone and encodings are whatever the unstubbed Stripe mock hands back
    ZoneId zone = mockedStripe.getFileTimeZone();
    ColumnMetadata<ColumnEncoding> encodings = mockedStripe.getColumnEncodings();

    readerUnderTest.startStripe(zone, dictionarySources, encodings);

    // the wrapped reader must have been started with exactly the same arguments
    verify(delegate, atLeastOnce()).startStripe(eq(zone), eq(dictionarySources), eq(encodings));
}
Also used : ColumnEncoding(io.prestosql.orc.metadata.ColumnEncoding) InputStreamSources(io.prestosql.orc.stream.InputStreamSources) ZoneId(java.time.ZoneId) Block(io.prestosql.spi.block.Block) CachingColumnReader(io.prestosql.orc.reader.CachingColumnReader) ColumnReader(io.prestosql.orc.reader.ColumnReader) CachingColumnReader(io.prestosql.orc.reader.CachingColumnReader) Test(org.testng.annotations.Test)

Example 10 with InputStreamSources

use of io.prestosql.orc.stream.InputStreamSources in project hetu-core by openlookeng.

the class TestCachingColumnReader method testCacheLoaderThrowsInterruptedException.

/**
 * Starts a row group on a {@code CachingColumnReader} whose wrapped reader interrupts the
 * current thread and throws from {@code readBlock()}; the test expects a
 * {@link PrestoException} whose message matches "Read interrupted".
 */
@Test(expectedExceptions = PrestoException.class, expectedExceptionsMessageRegExp = ".*Read interrupted.*")
public void testCacheLoaderThrowsInterruptedException() throws IOException {
    ColumnReader delegate = mock(ColumnReader.class);
    Cache<OrcRowDataCacheKey, Block> blockCache = spy(CacheBuilder.newBuilder().build());
    CachingColumnReader readerUnderTest = new CachingColumnReader(delegate, column, blockCache);

    // stream sources report metadata that points at data source "2"
    InputStreamSources rowGroupSources = mock(InputStreamSources.class);
    StreamSourceMeta sourceMeta = new StreamSourceMeta();
    sourceMeta.setDataSourceId(new OrcDataSourceId("2"));
    when(rowGroupSources.getStreamSourceMeta()).thenReturn(sourceMeta);

    // reading a block flags the thread as interrupted and then fails
    Answer<Block> interruptThenFail = invocation -> {
        Thread.currentThread().interrupt();
        throw new PrestoException(StandardErrorCode.GENERIC_INTERNAL_ERROR, "Read interrupted");
    };
    when(delegate.readBlock()).then(interruptThenFail);

    try {
        readerUnderTest.startRowGroup(rowGroupSources);
    } catch (IOException failure) {
        verify(delegate, atLeastOnce()).startRowGroup(eq(rowGroupSources));
        verify(delegate, times(1)).readBlock();
        assertEquals(blockCache.size(), 0);
        throw failure;
    } finally {
        // reset the interrupted flag so it does not leak out of this test
        Thread.interrupted();
    }
}
Also used : Assert.assertEquals(org.testng.Assert.assertEquals) Test(org.testng.annotations.Test) Callable(java.util.concurrent.Callable) OrcColumnId(io.prestosql.orc.metadata.OrcColumnId) CachingColumnReader(io.prestosql.orc.reader.CachingColumnReader) Mockito.spy(org.mockito.Mockito.spy) Answer(org.mockito.stubbing.Answer) ImmutableList(com.google.common.collect.ImmutableList) InputStreamSources(io.prestosql.orc.stream.InputStreamSources) StreamSourceMeta(io.prestosql.orc.stream.StreamSourceMeta) Block(io.prestosql.spi.block.Block) PrestoException(io.prestosql.spi.PrestoException) InOrder(org.mockito.InOrder) ColumnEncoding(io.prestosql.orc.metadata.ColumnEncoding) OrcType(io.prestosql.orc.metadata.OrcType) Mockito.atLeastOnce(org.mockito.Mockito.atLeastOnce) IOException(java.io.IOException) Mockito.times(org.mockito.Mockito.times) Mockito.when(org.mockito.Mockito.when) ColumnMetadata(io.prestosql.orc.metadata.ColumnMetadata) ZoneId(java.time.ZoneId) ColumnReader(io.prestosql.orc.reader.ColumnReader) Mockito.verify(org.mockito.Mockito.verify) Matchers.any(org.mockito.Matchers.any) Mockito.inOrder(org.mockito.Mockito.inOrder) CacheBuilder(com.google.common.cache.CacheBuilder) Cache(com.google.common.cache.Cache) StandardErrorCode(io.prestosql.spi.StandardErrorCode) Mockito.eq(org.mockito.Mockito.eq) Mockito.mock(org.mockito.Mockito.mock) InputStreamSources(io.prestosql.orc.stream.InputStreamSources) Block(io.prestosql.spi.block.Block) PrestoException(io.prestosql.spi.PrestoException) CachingColumnReader(io.prestosql.orc.reader.CachingColumnReader) ColumnReader(io.prestosql.orc.reader.ColumnReader) StreamSourceMeta(io.prestosql.orc.stream.StreamSourceMeta) IOException(java.io.IOException) CachingColumnReader(io.prestosql.orc.reader.CachingColumnReader) Test(org.testng.annotations.Test)

Aggregations

InputStreamSources (io.prestosql.orc.stream.InputStreamSources): 10; CachingColumnReader (io.prestosql.orc.reader.CachingColumnReader): 6; ColumnReader (io.prestosql.orc.reader.ColumnReader): 5; StreamSourceMeta (io.prestosql.orc.stream.StreamSourceMeta): 5; Block (io.prestosql.spi.block.Block): 5; Test (org.testng.annotations.Test): 5; ColumnEncoding (io.prestosql.orc.metadata.ColumnEncoding): 4; ZoneId (java.time.ZoneId): 4; ImmutableMap (com.google.common.collect.ImmutableMap): 3; ImmutableMap.toImmutableMap (com.google.common.collect.ImmutableMap.toImmutableMap): 3; OrcColumnId (io.prestosql.orc.metadata.OrcColumnId): 3; InputStreamSource (io.prestosql.orc.stream.InputStreamSource): 3; ValueInputStreamSource (io.prestosql.orc.stream.ValueInputStreamSource): 3; Callable (java.util.concurrent.Callable): 3; ImmutableList (com.google.common.collect.ImmutableList): 2; Checkpoints.getDictionaryStreamCheckpoint (io.prestosql.orc.checkpoint.Checkpoints.getDictionaryStreamCheckpoint): 2; StreamCheckpoint (io.prestosql.orc.checkpoint.StreamCheckpoint): 2; Stream (io.prestosql.orc.metadata.Stream): 2; AbstractColumnReader (io.prestosql.orc.reader.AbstractColumnReader): 2; OrcInputStream (io.prestosql.orc.stream.OrcInputStream): 2