Search in sources :

Example 1 with SelectiveStreamReader

use of com.facebook.presto.orc.reader.SelectiveStreamReader in project presto by prestodb.

the class TestOrcSelectiveStreamReaders method testEmptyStrings.

/**
 * Tests SliceDirectSelectiveStreamReader for the case where all elements to read are empty strings. The output Block should be a valid VariableWidthBlock with an
 * empty Slice. This simulates a problem seen in production. The state of SliceDirectSelectiveStreamReader needed to reproduce the problem is:
 * - dataStream: null
 * - presentStream: null
 * - lengthStream: not null
 * - filter: null
 * - outputRequired: true
 * - offsets array: non zeros
 * The test issues two reads: the first one reads a non-empty string and populates non-zero offsets; the second one reads the empty string with the above conditions met.
 * The scenario is repeated for every supported format and every compression kind, and the temporary file written for each combination is deleted when that
 * combination finishes.
 *
 * @throws Exception if writing the temporary ORC file or reading it back fails
 */
@Test
public void testEmptyStrings() throws Exception {
    Type type = VARCHAR;
    List<Type> types = ImmutableList.of(type);
    List<List<?>> values = ImmutableList.of(ImmutableList.of("a", ""));
    for (OrcTester.Format format : formats) {
        if (!types.stream().allMatch(readType -> format.supportsType(readType))) {
            // Skip only this format; the remaining formats must still be exercised.
            continue;
        }
        for (CompressionKind compression : compressions) {
            // try-with-resources so the temporary file is removed even when an assertion fails.
            try (TempFile tempFile = new TempFile()) {
                writeOrcColumnsPresto(tempFile.getFile(), format, compression, Optional.empty(), types, values, new OrcWriterStats());
                // NOTE(review): the predicate is always built for DWRF regardless of the format being iterated - confirm this is intentional.
                OrcPredicate orcPredicate = createOrcPredicate(types, values, DWRF, false);
                Map<Integer, Type> includedColumns = IntStream.range(0, types.size()).boxed().collect(toImmutableMap(Function.identity(), types::get));
                List<Integer> outputColumns = IntStream.range(0, types.size()).boxed().collect(toImmutableList());
                OrcAggregatedMemoryContext systemMemoryUsage = new TestingHiveOrcAggregatedMemoryContext();
                try (OrcSelectiveRecordReader recordReader = createCustomOrcSelectiveRecordReader(tempFile.getFile(), format.getOrcEncoding(), orcPredicate, types, 1, ImmutableMap.of(), ImmutableList.of(), ImmutableMap.of(), OrcTester.OrcReaderSettings.builder().build().getRequiredSubfields(), ImmutableMap.of(), ImmutableMap.of(), includedColumns, outputColumns, false, systemMemoryUsage, false)) {
                    assertEquals(recordReader.getReaderPosition(), 0);
                    assertEquals(recordReader.getFilePosition(), 0);
                    SelectiveStreamReader streamReader = recordReader.getStreamReaders()[0];
                    // Read the first non-empty element. Do not call streamReader.getBlock() to preserve the offsets array in SliceDirectSelectiveStreamReader.
                    int batchSize = min(recordReader.prepareNextBatch(), 1);
                    int[] positions = IntStream.range(0, batchSize).toArray();
                    streamReader.read(0, positions, batchSize);
                    recordReader.batchRead(batchSize);
                    // Read the second element: an empty string. Set the dataStream in SliceDirectSelectiveStreamReader to null to simulate the conditions causing the problem.
                    ((SliceSelectiveStreamReader) streamReader).resetDataStream();
                    batchSize = min(recordReader.prepareNextBatch(), 1);
                    positions = IntStream.range(0, batchSize).toArray();
                    streamReader.read(0, positions, batchSize);
                    recordReader.batchRead(batchSize);
                    Block block = streamReader.getBlock(positions, batchSize);
                    List<?> expectedValues = ImmutableList.of("");
                    assertBlockEquals(type, block, expectedValues, 0);
                    assertEquals(recordReader.getReaderPosition(), 1);
                    assertEquals(recordReader.getFilePosition(), 1);
                }
            }
        }
    }
}
Also used : IntStream(java.util.stream.IntStream) OrcTester.createCustomOrcSelectiveRecordReader(com.facebook.presto.orc.OrcTester.createCustomOrcSelectiveRecordReader) VARCHAR(com.facebook.presto.common.type.VarcharType.VARCHAR) SliceSelectiveStreamReader(com.facebook.presto.orc.reader.SliceSelectiveStreamReader) ORC_11(com.facebook.presto.orc.OrcTester.Format.ORC_11) Assert.assertEquals(org.testng.Assert.assertEquals) Test(org.testng.annotations.Test) ORC_12(com.facebook.presto.orc.OrcTester.Format.ORC_12) Function(java.util.function.Function) OrcTester.assertBlockEquals(com.facebook.presto.orc.OrcTester.assertBlockEquals) ZLIB(com.facebook.presto.orc.metadata.CompressionKind.ZLIB) ImmutableList(com.google.common.collect.ImmutableList) Map(java.util.Map) OrcTester.writeOrcColumnsPresto(com.facebook.presto.orc.OrcTester.writeOrcColumnsPresto) Type(com.facebook.presto.common.type.Type) ImmutableSet(com.google.common.collect.ImmutableSet) ImmutableMap(com.google.common.collect.ImmutableMap) NONE(com.facebook.presto.orc.metadata.CompressionKind.NONE) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) Set(java.util.Set) SNAPPY(com.facebook.presto.orc.metadata.CompressionKind.SNAPPY) Math.min(java.lang.Math.min) DWRF(com.facebook.presto.orc.OrcTester.Format.DWRF) List(java.util.List) ImmutableMap.toImmutableMap(com.google.common.collect.ImmutableMap.toImmutableMap) SelectiveStreamReader(com.facebook.presto.orc.reader.SelectiveStreamReader) CompressionKind(com.facebook.presto.orc.metadata.CompressionKind) LZ4(com.facebook.presto.orc.metadata.CompressionKind.LZ4) TestingOrcPredicate.createOrcPredicate(com.facebook.presto.orc.TestingOrcPredicate.createOrcPredicate) Optional(java.util.Optional) Block(com.facebook.presto.common.block.Block) ZSTD(com.facebook.presto.orc.metadata.CompressionKind.ZSTD) OrcTester.createCustomOrcSelectiveRecordReader(com.facebook.presto.orc.OrcTester.createCustomOrcSelectiveRecordReader) 
CompressionKind(com.facebook.presto.orc.metadata.CompressionKind) Type(com.facebook.presto.common.type.Type) SliceSelectiveStreamReader(com.facebook.presto.orc.reader.SliceSelectiveStreamReader) SelectiveStreamReader(com.facebook.presto.orc.reader.SelectiveStreamReader) Block(com.facebook.presto.common.block.Block) ImmutableList(com.google.common.collect.ImmutableList) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) List(java.util.List) SliceSelectiveStreamReader(com.facebook.presto.orc.reader.SliceSelectiveStreamReader) TestingOrcPredicate.createOrcPredicate(com.facebook.presto.orc.TestingOrcPredicate.createOrcPredicate) Test(org.testng.annotations.Test)

Example 2 with SelectiveStreamReader

use of com.facebook.presto.orc.reader.SelectiveStreamReader in project presto by prestodb.

the class OrcSelectiveRecordReader method createStreamReaders.

/**
 * Creates the per-column selective stream readers for the top-level fields of the file's row type.
 * The returned array is indexed by column id and has one slot per field of {@code types.get(0)};
 * slots for columns that are not keys of {@code includedColumns} are left {@code null}.
 * A column's reader is asked to produce output when the column is listed in {@code outputColumns}
 * or when it is an input of one of the {@code filterFunctions} (after translating the function's
 * input channels through {@code filterFunctionInputMapping}).
 */
private static SelectiveStreamReader[] createStreamReaders(OrcDataSource orcDataSource, List<OrcType> types, DateTimeZone hiveStorageTimeZone, OrcRecordReaderOptions options, boolean legacyMapSubscript, Map<Integer, Type> includedColumns, List<Integer> outputColumns, Map<Integer, Map<Subfield, TupleDomainFilter>> filters, List<FilterFunction> filterFunctions, Map<Integer, Integer> filterFunctionInputMapping, Map<Integer, List<Subfield>> requiredSubfields, OrcAggregatedMemoryContext systemMemoryContext) {
    List<StreamDescriptor> nestedDescriptors = createStreamDescriptor("", "", 0, types, orcDataSource).getNestedStreams();

    requireNonNull(filterFunctions, "filterFunctions is null");
    requireNonNull(filterFunctionInputMapping, "filterFunctionInputMapping is null");

    // Columns that feed at least one filter function must be materialized even if they are not projected.
    Set<Integer> columnsNeededByFilterFunctions = filterFunctions.stream()
            .flatMapToInt(filterFunction -> Arrays.stream(filterFunction.getInputChannels()))
            .boxed()
            .map(filterFunctionInputMapping::get)
            .collect(toImmutableSet());

    int fieldCount = types.get(0).getFieldCount();
    SelectiveStreamReader[] readers = new SelectiveStreamReader[fieldCount];
    for (int column = 0; column < fieldCount; column++) {
        if (!includedColumns.containsKey(column)) {
            // Not requested: leave the slot null.
            continue;
        }

        StreamDescriptor descriptor = nestedDescriptors.get(column);
        boolean needsOutput = outputColumns.contains(column) || columnsNeededByFilterFunctions.contains(column);

        // Missing entries fall back to "no filters" / "no subfield pruning".
        Map<Subfield, TupleDomainFilter> columnFilters = Optional.ofNullable(filters.get(column)).orElse(ImmutableMap.of());
        Optional<Type> outputType;
        if (needsOutput) {
            outputType = Optional.of(includedColumns.get(column));
        }
        else {
            outputType = Optional.empty();
        }
        List<Subfield> columnSubfields = Optional.ofNullable(requiredSubfields.get(column)).orElse(ImmutableList.of());

        readers[column] = createStreamReader(descriptor, columnFilters, outputType, columnSubfields, hiveStorageTimeZone, options, legacyMapSubscript, systemMemoryContext);
    }
    return readers;
}
Also used : Page(com.facebook.presto.common.Page) DateTimeZone(org.joda.time.DateTimeZone) Arrays(java.util.Arrays) RunLengthEncodedBlock(com.facebook.presto.common.block.RunLengthEncodedBlock) BigintRange(com.facebook.presto.common.predicate.TupleDomainFilter.BigintRange) Preconditions.checkArgument(com.google.common.base.Preconditions.checkArgument) Predicates.not(com.google.common.base.Predicates.not) Map(java.util.Map) StripeInformation(com.facebook.presto.orc.metadata.StripeInformation) Varchars.isVarcharType(com.facebook.presto.common.type.Varchars.isVarcharType) RuntimeStats(com.facebook.presto.common.RuntimeStats) BlockLease(com.facebook.presto.common.block.BlockLease) BigintValuesUsingHashTable(com.facebook.presto.common.predicate.TupleDomainFilter.BigintValuesUsingHashTable) ImmutableMap(com.google.common.collect.ImmutableMap) DOUBLE(com.facebook.presto.common.type.DoubleType.DOUBLE) Collection(java.util.Collection) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) Set(java.util.Set) Math.min(java.lang.Math.min) ColumnStatistics(com.facebook.presto.orc.metadata.statistics.ColumnStatistics) UncheckedIOException(java.io.UncheckedIOException) Objects(java.util.Objects) List(java.util.List) ImmutableMap.toImmutableMap(com.google.common.collect.ImmutableMap.toImmutableMap) ClassLayout(org.openjdk.jol.info.ClassLayout) SizeOf.sizeOf(io.airlift.slice.SizeOf.sizeOf) LazyBlockLoader(com.facebook.presto.common.block.LazyBlockLoader) SelectiveStreamReader(com.facebook.presto.orc.reader.SelectiveStreamReader) INTEGER(com.facebook.presto.common.type.IntegerType.INTEGER) SelectiveStreamReaders.createStreamReader(com.facebook.presto.orc.reader.SelectiveStreamReaders.createStreamReader) Optional(java.util.Optional) Math.max(java.lang.Math.max) LazyBlock(com.facebook.presto.common.block.LazyBlock) IntStream(java.util.stream.IntStream) Iterables(com.google.common.collect.Iterables) 
DecimalType(com.facebook.presto.common.type.DecimalType) Slice(io.airlift.slice.Slice) TINYINT(com.facebook.presto.common.type.TinyintType.TINYINT) HashMap(java.util.HashMap) TIMESTAMP(com.facebook.presto.common.type.TimestampType.TIMESTAMP) Function(java.util.function.Function) DATE(com.facebook.presto.common.type.DateType.DATE) REAL(com.facebook.presto.common.type.RealType.REAL) ArrayList(java.util.ArrayList) Subfield(com.facebook.presto.common.Subfield) ImmutableList(com.google.common.collect.ImmutableList) Verify.verify(com.google.common.base.Verify.verify) FilterFunction(com.facebook.presto.common.predicate.FilterFunction) Objects.requireNonNull(java.util.Objects.requireNonNull) BOOLEAN(com.facebook.presto.common.type.BooleanType.BOOLEAN) ImmutableSet.toImmutableSet(com.google.common.collect.ImmutableSet.toImmutableSet) CharType(com.facebook.presto.common.type.CharType) OrcType(com.facebook.presto.orc.metadata.OrcType) Type(com.facebook.presto.common.type.Type) Nullable(javax.annotation.Nullable) MAX_BATCH_SIZE(com.facebook.presto.orc.OrcReader.MAX_BATCH_SIZE) BIGINT(com.facebook.presto.common.type.BigintType.BIGINT) BigintMultiRange(com.facebook.presto.common.predicate.TupleDomainFilter.BigintMultiRange) StripeStatistics(com.facebook.presto.orc.metadata.statistics.StripeStatistics) PostScript(com.facebook.presto.orc.metadata.PostScript) LongArrayBlock(com.facebook.presto.common.block.LongArrayBlock) IOException(java.io.IOException) Maps(com.google.common.collect.Maps) Ints(com.google.common.primitives.Ints) BigintValuesUsingBitmask(com.facebook.presto.common.predicate.TupleDomainFilter.BigintValuesUsingBitmask) TupleDomainFilter(com.facebook.presto.common.predicate.TupleDomainFilter) SMALLINT(com.facebook.presto.common.type.SmallintType.SMALLINT) Block(com.facebook.presto.common.block.Block) Comparator(java.util.Comparator) MetadataReader(com.facebook.presto.orc.metadata.MetadataReader) OrcType(com.facebook.presto.orc.metadata.OrcType) 
SelectiveStreamReader(com.facebook.presto.orc.reader.SelectiveStreamReader)

Example 3 with SelectiveStreamReader

use of com.facebook.presto.orc.reader.SelectiveStreamReader in project presto by prestodb.

the class OrcSelectiveRecordReader method getNextPage.

/**
 * Reads the next batch of rows, applies all filters, and assembles the surviving rows into a Page.
 *
 * Filtering proceeds in stages, each narrowing the set of positions handed to the next:
 * the filter function without inputs (if present), the filter functions with constant inputs,
 * and then the stream readers in {@code streamReaderOrder}, each optionally followed by its
 * associated filter functions. Errors recorded during filtering are only thrown for positions
 * that survive every stage.
 *
 * @return {@code null} when {@code constantFilterIsFalse} is set or {@code prepareNextBatch()} reports a negative batch size;
 *         {@code EMPTY_PAGE} when the batch was read but no position passed the filters;
 *         otherwise a page with one block per output column, plus a trailing row-number block when {@code appendRowNumber} is set
 * @throws IOException declared by this method; presumably raised by the underlying stream reads - confirm against callees
 */
public Page getNextPage() throws IOException {
    if (constantFilterIsFalse) {
        return null;
    }
    int batchSize = prepareNextBatch();
    if (batchSize < 0) {
        return null;
    }
    readPositions += batchSize;
    initializePositions(batchSize);
    // Start with the positions prepared by initializePositions(); each filtering stage below may replace this array and shrink positionCount.
    int[] positionsToRead = this.positions;
    int positionCount = batchSize;
    if (filterFunctionWithoutInput.isPresent()) {
        positionCount = applyFilterFunctionWithNoInputs(positionCount);
        if (positionCount == 0) {
            // Nothing survived: still account for the whole batch as read before returning.
            batchRead(batchSize);
            return EMPTY_PAGE;
        }
        positionsToRead = outputPositions;
    }
    if (!filterFunctionsWithConstantInputs.isEmpty()) {
        positionCount = applyFilterFunctions(filterFunctionsWithConstantInputs, filterFunctionConstantInputs, positionsToRead, positionCount);
        if (positionCount == 0) {
            batchRead(batchSize);
            return EMPTY_PAGE;
        }
        positionsToRead = outputPositions;
    }
    int offset = getNextRowInGroup();
    if (reorderFilters && offset >= MAX_BATCH_SIZE) {
        reorderFiltersIfNeeded();
    }
    for (int i = 0; i < streamReaderOrder.length; i++) {
        int columnIndex = streamReaderOrder[i];
        // The loop stops at the first column without a filter; presumably streamReaderOrder lists filtered columns first - TODO confirm.
        if (!hasAnyFilter(columnIndex)) {
            break;
        }
        SelectiveStreamReader streamReader = getStreamReader(columnIndex);
        positionCount = streamReader.read(offset, positionsToRead, positionCount);
        if (positionCount == 0) {
            break;
        }
        positionsToRead = streamReader.getReadPositions();
        // Cheap sanity check: only compares the first and last position against the count, it does not inspect every element.
        verify(positionCount == 1 || positionsToRead[positionCount - 1] - positionsToRead[0] >= positionCount - 1, "positions must monotonically increase");
        if (filterFunctionsOrder[i] != null) {
            positionCount = applyFilterFunctions(filterFunctionsOrder[i], filterFunctionInputs[i], positionsToRead, positionCount);
            if (positionCount == 0) {
                break;
            }
            positionsToRead = outputPositions;
        }
    }
    localMemoryContext.setBytes(getSelfRetainedSizeInBytes());
    batchRead(batchSize);
    if (positionCount == 0) {
        return EMPTY_PAGE;
    }
    // Errors are raised only now, after filtering, and only for positions that are still selected.
    if (constantFilterError != null) {
        throw constantFilterError;
    }
    for (int i = 0; i < positionCount; i++) {
        if (errors[positionsToRead[i]] != null) {
            throw errors[positionsToRead[i]];
        }
    }
    for (SelectiveStreamReader reader : getStreamReaders()) {
        // Reader slots may be null, hence the check.
        if (reader != null) {
            reader.throwAnyError(positionsToRead, positionCount);
        }
    }
    // One block per output column, plus one extra slot for row numbers when requested.
    Block[] blocks = new Block[appendRowNumber ? outputColumns.size() + 1 : outputColumns.size()];
    for (int i = 0; i < outputColumns.size(); i++) {
        int columnIndex = outputColumns.get(i);
        if (constantValues[columnIndex] != null) {
            // Constant column: emit a run-length-encoded block; NULL_MARKER stands in for a constant null value.
            blocks[i] = RunLengthEncodedBlock.create(columnTypes.get(columnIndex), constantValues[columnIndex] == NULL_MARKER ? null : constantValues[columnIndex], positionCount);
        } else if (!hasAnyFilter(columnIndex)) {
            // Unfiltered column: it was not read in the loop above, so wrap it in a LazyBlock backed by an OrcBlockLoader.
            blocks[i] = new LazyBlock(positionCount, new OrcBlockLoader(columnIndex, offset, positionsToRead, positionCount));
        } else {
            // Filtered column: take the block for the surviving positions from its stream reader.
            Block block = getStreamReader(columnIndex).getBlock(positionsToRead, positionCount);
            updateMaxCombinedBytesPerRow(hiveColumnIndices[columnIndex], block);
            if (coercers[columnIndex] != null) {
                block = coercers[columnIndex].apply(block);
            }
            blocks[i] = block;
        }
    }
    if (appendRowNumber) {
        blocks[outputColumns.size()] = createRowNumbersBlock(positionsToRead, positionCount, this.getFilePosition());
    }
    Page page = new Page(positionCount, blocks);
    validateWritePageChecksum(page);
    return page;
}
Also used : LazyBlock(com.facebook.presto.common.block.LazyBlock) SelectiveStreamReader(com.facebook.presto.orc.reader.SelectiveStreamReader) RunLengthEncodedBlock(com.facebook.presto.common.block.RunLengthEncodedBlock) LazyBlock(com.facebook.presto.common.block.LazyBlock) LongArrayBlock(com.facebook.presto.common.block.LongArrayBlock) Block(com.facebook.presto.common.block.Block) Page(com.facebook.presto.common.Page)

Aggregations

Block (com.facebook.presto.common.block.Block)3 SelectiveStreamReader (com.facebook.presto.orc.reader.SelectiveStreamReader)3 Page (com.facebook.presto.common.Page)2 LazyBlock (com.facebook.presto.common.block.LazyBlock)2 LongArrayBlock (com.facebook.presto.common.block.LongArrayBlock)2 RunLengthEncodedBlock (com.facebook.presto.common.block.RunLengthEncodedBlock)2 Type (com.facebook.presto.common.type.Type)2 ImmutableList (com.google.common.collect.ImmutableList)2 ImmutableList.toImmutableList (com.google.common.collect.ImmutableList.toImmutableList)2 ImmutableMap (com.google.common.collect.ImmutableMap)2 ImmutableMap.toImmutableMap (com.google.common.collect.ImmutableMap.toImmutableMap)2 Math.min (java.lang.Math.min)2 List (java.util.List)2 Map (java.util.Map)2 Optional (java.util.Optional)2 Set (java.util.Set)2 Function (java.util.function.Function)2 IntStream (java.util.stream.IntStream)2 RuntimeStats (com.facebook.presto.common.RuntimeStats)1 Subfield (com.facebook.presto.common.Subfield)1