Search in sources :

Example 1 with StripeInformation

use of io.prestosql.orc.metadata.StripeInformation in project hetu-core by openlookeng.

the class OrcRecordReader method filterRows.

/**
 * Narrows {@code block} to the positions of the current batch that are listed in the
 * matching-row iterator registered for the current stripe.
 *
 * <p>When {@code matchingRowsInBatchArray} is not yet set, the current stripe has an entry in
 * {@code stripeMatchingRows} and the block is non-empty, the batch-relative matching positions
 * are computed and stored in {@code matchingRowsInBatchArray}. If that array is set (either
 * here or before the call), the block is reduced to those positions; otherwise the block is
 * returned untouched.
 *
 * @param block the block read for the current batch
 * @return the block restricted to matching positions, or {@code block} itself when no
 *         matching-row information applies
 */
private Block filterRows(Block block) {
    StripeInformation stripe = stripes.get(currentStripe);
    if (matchingRowsInBatchArray == null && stripeMatchingRows.containsKey(stripe) && block.getPositionCount() != 0) {
        // The batch covers stripe-relative rows [batchStart, batchEnd).
        long batchStart = currentPosition - currentStripePosition;
        long batchEnd = batchStart + currentBatchSize;
        PeekingIterator<Integer> matchingRows = stripeMatchingRows.get(stripe);
        List<Integer> matchingRowsInBlock = new ArrayList<>();
        // NOTE(review): the early break below assumes the iterator yields rows in ascending order - confirm.
        while (matchingRows.hasNext()) {
            int row = matchingRows.peek();
            if (row >= batchEnd) {
                // matchingRows cursor is ahead of current batch, next batch will use it
                break;
            }
            if (row >= batchStart) {
                // matchingRows cursor is within current batch: record it relative to the batch start
                matchingRowsInBlock.add(toIntExact(row - batchStart));
            }
            // Otherwise the cursor is behind the batch. This can happen if a row group containing
            // matching rows was filtered out: e.g. matchingRows is for column1 but the query is on
            // column1 and column2. Row groups carry min/max values, so a row group may have been
            // skipped because of the column2 predicate. The cursor could then still be at row 10
            // (inside skipped row group 1, rows 1 to 10000) while the batch already starts at
            // row 10002 (inside row group 2, rows 10001 to 20000). Such rows are simply dropped.
            matchingRows.next();
        }
        // Unbox once into the primitive array instead of filling it through a side-effecting lambda.
        matchingRowsInBatchArray = matchingRowsInBlock.stream().mapToInt(Integer::intValue).toArray();
    }
    if (matchingRowsInBatchArray != null) {
        return block.copyPositions(matchingRowsInBatchArray, 0, matchingRowsInBatchArray.length);
    }
    return block;
}
Also used : ArrayList(java.util.ArrayList) StripeInformation(io.prestosql.orc.metadata.StripeInformation)

Example 2 with StripeInformation

use of io.prestosql.orc.metadata.StripeInformation in project hetu-core by openlookeng.

the class OrcSelectiveRecordReader method initializePositions.

/**
 * Produces the batch-relative positions that should be read for the upcoming batch.
 *
 * <p>When {@code matchingRowsInBatchArray} is not yet set and the current stripe has an entry
 * in {@code stripeMatchingRows}, the matching rows that fall inside the batch are collected
 * (relative to the batch start) and stored in {@code matchingRowsInBatchArray}. If that array
 * is set (either here or before the call) it is returned; otherwise every position
 * {@code 0 .. batchSize - 1} is returned.
 *
 * @param batchSize number of rows in the upcoming batch
 * @return the positions to read, relative to the start of the batch
 */
private int[] initializePositions(int batchSize) {
    StripeInformation stripe = stripes.get(currentStripe);
    if (matchingRowsInBatchArray == null && stripeMatchingRows.containsKey(stripe)) {
        // The batch covers stripe-relative rows [batchStart, batchEnd).
        long batchStart = currentPosition - currentStripePosition;
        long batchEnd = batchStart + batchSize;
        PeekingIterator<Integer> matchingRows = stripeMatchingRows.get(stripe);
        List<Integer> matchingRowsInBlock = new ArrayList<>();
        // NOTE(review): the early break below assumes the iterator yields rows in ascending order - confirm.
        while (matchingRows.hasNext()) {
            int row = matchingRows.peek();
            if (row >= batchEnd) {
                // matchingRows cursor is ahead of current batch, next batch will use it
                break;
            }
            if (row >= batchStart) {
                // matchingRows cursor is within current batch: record it relative to the batch start
                matchingRowsInBlock.add(toIntExact(row - batchStart));
            }
            // Otherwise the cursor is behind the batch. This can happen if a row group containing
            // matching rows was filtered out: e.g. matchingRows is for column1 but the query is on
            // column1 and column2. Row groups carry min/max values, so a row group may have been
            // skipped because of the column2 predicate. The cursor could then still be at row 10
            // (inside skipped row group 1, rows 1 to 10000) while the batch already starts at
            // row 10002 (inside row group 2, rows 10001 to 20000). Such rows are simply dropped.
            matchingRows.next();
        }
        // Unbox once into the primitive array instead of filling it through a side-effecting lambda.
        matchingRowsInBatchArray = matchingRowsInBlock.stream().mapToInt(Integer::intValue).toArray();
    }
    if (matchingRowsInBatchArray != null) {
        return matchingRowsInBatchArray;
    }
    // No matching-row information applies: select every position of the batch.
    int[] positions = new int[batchSize];
    for (int i = 0; i < batchSize; i++) {
        positions[i] = i;
    }
    return positions;
}
Also used : ArrayList(java.util.ArrayList) StripeInformation(io.prestosql.orc.metadata.StripeInformation)

Example 3 with StripeInformation

use of io.prestosql.orc.metadata.StripeInformation in project hetu-core by openlookeng.

the class AbstractOrcRecordReader method advanceToNextStripe.

/**
 * Moves the reader on to the following stripe.
 *
 * <p>The per-stripe memory context is replaced, {@code rowGroups} is reset to an empty
 * iterator, and, if stripe statistics validation is configured and a stripe was already
 * current, the statistics gathered for that stripe are validated and reset. When another
 * stripe exists it is read and every non-null column reader is started on it; if
 * {@code readStripe} yields {@code null}, {@code rowGroups} stays empty.
 *
 * @throws IOException declared by the signature; raised by the underlying calls
 */
private void advanceToNextStripe() throws IOException {
    // Swap the memory context of the previous stripe for a fresh one.
    currentStripeSystemMemoryContext.close();
    currentStripeSystemMemoryContext = systemMemoryUsage.newAggregatedMemoryContext();
    rowGroups = ImmutableList.<RowGroup>of().iterator();

    // Validate the statistics collected for the stripe we are leaving, if there was one.
    if (currentStripe >= 0 && stripeStatisticsValidation.isPresent()) {
        OrcWriteValidation.StatisticsValidation finishedStripeStatistics = stripeStatisticsValidation.get();
        long finishedStripeOffset = stripes.get(currentStripe).getOffset();
        writeValidation.get().validateStripeStatistics(orcDataSource.getId(), finishedStripeOffset, finishedStripeStatistics.build().get());
        finishedStripeStatistics.reset();
    }

    currentStripe++;
    if (currentStripe >= stripes.size()) {
        // No stripes left.
        return;
    }
    if (currentStripe > 0) {
        // Shift the stripe start position by the row count of the stripe just left.
        currentStripePosition += stripes.get(currentStripe - 1).getNumberOfRows();
    }

    StripeInformation nextStripeInformation = stripes.get(currentStripe);
    validateWriteStripe(nextStripeInformation.getNumberOfRows());
    Stripe nextStripe = stripeReader.readStripe(nextStripeInformation, currentStripeSystemMemoryContext);
    if (nextStripe == null) {
        return;
    }

    // Give readers access to dictionary streams
    InputStreamSources dictionarySources = nextStripe.getDictionaryStreamSources();
    ColumnMetadata<ColumnEncoding> encodings = nextStripe.getColumnEncodings();
    for (AbstractColumnReader reader : columnReaders) {
        if (reader == null) {
            continue;
        }
        ZoneId stripeTimeZone = nextStripe.getFileTimeZone();
        reader.startStripe(stripeTimeZone, dictionarySources, encodings);
    }
    rowGroups = nextStripe.getRowGroups().iterator();
}
Also used : ColumnEncoding(io.prestosql.orc.metadata.ColumnEncoding) InputStreamSources(io.prestosql.orc.stream.InputStreamSources) ZoneId(java.time.ZoneId) AbstractColumnReader(io.prestosql.orc.reader.AbstractColumnReader) StripeInformation(io.prestosql.orc.metadata.StripeInformation)

Example 4 with StripeInformation

use of io.prestosql.orc.metadata.StripeInformation in project hetu-core by openlookeng.

the class TestCachingOrcDataSource method doIntegration.

/**
 * Reads the whole file through an {@code OrcReader} built on {@code orcDataSource} and checks
 * the read count after opening, the minimum stripe count, that the data source gets wrapped
 * in a {@code CachingOrcDataSource}, and the total number of positions read.
 *
 * @param orcDataSource data source under test
 * @param maxMergeDistance merge distance passed to the reader and to the cache wrapper
 * @param tinyStripeThreshold tiny-stripe threshold passed to the reader and to the cache wrapper
 * @throws IOException declared by the signature; raised by the underlying reader calls
 */
private void doIntegration(TestingOrcDataSource orcDataSource, DataSize maxMergeDistance, DataSize tinyStripeThreshold) throws IOException {
    OrcReader orcReader = new OrcReader(orcDataSource, maxMergeDistance, tinyStripeThreshold, new DataSize(1, Unit.MEGABYTE));
    // Opening the reader costs exactly one read (the file footer).
    assertEquals(orcDataSource.getReadCount(), 1);

    List<StripeInformation> stripes = orcReader.getFooter().getStripes();
    // Sanity check number of stripes. This can be three or higher because of orc writer low memory mode.
    assertGreaterThanOrEqual(stripes.size(), 3);

    // The wrapping helper must hand back a CachingOrcDataSource for this file.
    assertInstanceOf(wrapWithCacheIfTinyStripes(orcDataSource, stripes, maxMergeDistance, tinyStripeThreshold), CachingOrcDataSource.class);

    OrcRecordReader recordReader = orcReader.createRecordReader(
            orcReader.getRootColumn().getNestedColumns(),
            ImmutableList.of(VARCHAR),
            (numberOfRows, statisticsByColumnIndex) -> true,
            HIVE_STORAGE_TIME_ZONE,
            newSimpleAggregatedMemoryContext(),
            INITIAL_BATCH_SIZE,
            RuntimeException::new);

    // Drain every page and count the positions of the first column.
    int totalPositions = 0;
    for (Page page = recordReader.nextPage(); page != null; page = recordReader.nextPage()) {
        Block firstColumn = page.getLoadedPage().getBlock(0);
        totalPositions += firstColumn.getPositionCount();
    }
    assertEquals(totalPositions, POSITION_COUNT);
}
Also used : DataSize(io.airlift.units.DataSize) Block(io.prestosql.spi.block.Block) Page(io.prestosql.spi.Page) StripeInformation(io.prestosql.orc.metadata.StripeInformation)

Example 5 with StripeInformation

use of io.prestosql.orc.metadata.StripeInformation in project hetu-core by openlookeng.

the class OrcWriter method bufferStripeData.

/**
 * Collects the outputs that make up the current stripe. The returned entries are not the
 * stripe bytes themselves; they are objects that know how to write the data.
 *
 * <p>Index streams come first, then the data streams in sorted order, then the stripe footer.
 * The closed stripe is also recorded in {@code closedStripes}, in the write validation (if
 * any) and in {@code stats}.
 *
 * @param stripeStartOffset offset recorded as the start of this stripe
 * @param flushReason reason the stripe is flushed; an empty stripe is only verified as
 *        acceptable for {@code CLOSED}
 * @return the ordered outputs of the stripe, or an empty list when the stripe has no rows
 * @throws IOException declared by the signature; raised by the underlying calls
 */
private List<OrcDataOutput> bufferStripeData(long stripeStartOffset, FlushReason flushReason) throws IOException {
    if (stripeRowCount == 0) {
        verify(flushReason == CLOSED, "An empty stripe is not allowed");
        // column writers must be closed or the reset call will fail
        for (ColumnWriter writer : columnWriters) {
            writer.close();
        }
        return ImmutableList.of();
    }

    if (rowGroupRowCount > 0) {
        finishRowGroup();
    }

    // convert any dictionary encoded column with a low compression ratio to direct
    dictionaryCompressionOptimizer.finalOptimize(bufferedBytes);

    for (ColumnWriter writer : columnWriters) {
        writer.close();
    }

    List<OrcDataOutput> stripeOutputs = new ArrayList<>();
    List<Stream> streamDescriptors = new ArrayList<>(columnWriters.size() * 3);

    // Index streams, in column-writer order.
    long indexLength = 0;
    for (ColumnWriter writer : columnWriters) {
        for (StreamDataOutput index : writer.getIndexStreams(metadataWriter)) {
            // Outputs and descriptors must be appended in the same order: a stream records only
            // its length, not its offset.
            stripeOutputs.add(index);
            streamDescriptors.add(index.getStream());
            indexLength += index.size();
        }
    }

    // Data streams: gather them all, total their sizes, then sort.
    long dataLength = 0;
    List<StreamDataOutput> sortedDataStreams = new ArrayList<>(columnWriters.size() * 2);
    for (ColumnWriter writer : columnWriters) {
        for (StreamDataOutput data : writer.getDataStreams()) {
            sortedDataStreams.add(data);
            dataLength += data.size();
        }
    }
    Collections.sort(sortedDataStreams);
    for (StreamDataOutput data : sortedDataStreams) {
        // Same ordering constraint as for the index streams.
        stripeOutputs.add(data);
        streamDescriptors.add(data.getStream());
    }

    Map<OrcColumnId, ColumnEncoding> columnEncodings = new HashMap<>();
    for (ColumnWriter writer : columnWriters) {
        columnEncodings.putAll(writer.getColumnEncodings());
    }
    Map<OrcColumnId, ColumnStatistics> columnStatistics = new HashMap<>();
    for (ColumnWriter writer : columnWriters) {
        columnStatistics.putAll(writer.getColumnStripeStatistics());
    }

    // the 0th column is a struct column for the whole row
    columnEncodings.put(ROOT_COLUMN, new ColumnEncoding(DIRECT, 0));
    columnStatistics.put(ROOT_COLUMN, new ColumnStatistics((long) stripeRowCount, 0, null, null, null, null, null, null, null, null));

    // Stripe footer goes last in the output list.
    StripeFooter stripeFooter = new StripeFooter(streamDescriptors, toColumnMetadata(columnEncodings, orcTypes.size()), ZoneId.of("UTC"));
    Slice footerSlice = metadataWriter.writeStripeFooter(stripeFooter);
    stripeOutputs.add(createDataOutput(footerSlice));

    // Final stripe statistics, reported to the write validation.
    StripeStatistics stripeStatistics = new StripeStatistics(toColumnMetadata(columnStatistics, orcTypes.size()));
    recordValidation(validation -> validation.addStripeStatistics(stripeStartOffset, stripeStatistics));

    // Register the closed stripe and update bookkeeping.
    StripeInformation closedStripeInformation = new StripeInformation(stripeRowCount, stripeStartOffset, indexLength, dataLength, footerSlice.length());
    ClosedStripe closedStripe = new ClosedStripe(closedStripeInformation, stripeStatistics);
    closedStripes.add(closedStripe);
    closedStripesRetainedBytes += closedStripe.getRetainedSizeInBytes();
    recordValidation(validation -> validation.addStripe(closedStripeInformation.getNumberOfRows()));
    stats.recordStripeWritten(flushReason, closedStripeInformation.getTotalLength(), closedStripeInformation.getNumberOfRows(), dictionaryCompressionOptimizer.getDictionaryMemoryBytes());
    return stripeOutputs;
}
Also used : ColumnStatistics(io.prestosql.orc.metadata.statistics.ColumnStatistics) OrcColumnId(io.prestosql.orc.metadata.OrcColumnId) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) StripeStatistics(io.prestosql.orc.metadata.statistics.StripeStatistics) ColumnWriters.createColumnWriter(io.prestosql.orc.writer.ColumnWriters.createColumnWriter) ColumnWriter(io.prestosql.orc.writer.ColumnWriter) SliceDictionaryColumnWriter(io.prestosql.orc.writer.SliceDictionaryColumnWriter) StreamDataOutput(io.prestosql.orc.stream.StreamDataOutput) OrcDataOutput(io.prestosql.orc.stream.OrcDataOutput) ColumnEncoding(io.prestosql.orc.metadata.ColumnEncoding) StripeFooter(io.prestosql.orc.metadata.StripeFooter) Slices.utf8Slice(io.airlift.slice.Slices.utf8Slice) Slice(io.airlift.slice.Slice) Stream(io.prestosql.orc.metadata.Stream) StripeInformation(io.prestosql.orc.metadata.StripeInformation)

Aggregations

StripeInformation (io.prestosql.orc.metadata.StripeInformation)9 DataSize (io.airlift.units.DataSize)5 ArrayList (java.util.ArrayList)4 Slice (io.airlift.slice.Slice)3 ColumnEncoding (io.prestosql.orc.metadata.ColumnEncoding)3 Page (io.prestosql.spi.Page)3 Block (io.prestosql.spi.block.Block)3 Test (org.testng.annotations.Test)3 Stream (io.prestosql.orc.metadata.Stream)2 StripeFooter (io.prestosql.orc.metadata.StripeFooter)2 ColumnStatistics (io.prestosql.orc.metadata.statistics.ColumnStatistics)2 StripeStatistics (io.prestosql.orc.metadata.statistics.StripeStatistics)2 HashMap (java.util.HashMap)2 VisibleForTesting (com.google.common.annotations.VisibleForTesting)1 Preconditions.checkArgument (com.google.common.base.Preconditions.checkArgument)1 ImmutableList (com.google.common.collect.ImmutableList)1 ImmutableMap (com.google.common.collect.ImmutableMap)1 ImmutableSet (com.google.common.collect.ImmutableSet)1 Iterators (com.google.common.collect.Iterators)1 Maps (com.google.common.collect.Maps)1