Search in sources :

Example 6 with OrcColumnId

Use of io.prestosql.orc.metadata.OrcColumnId in the hetu-core project by openLooKeng.

The following example shows the getStreamDataOutput method of the PresentOutputStream class.

/**
 * Returns the PRESENT stream contents for the given column, or
 * {@code Optional.empty()} when no boolean output stream was created
 * (i.e. no PRESENT stream is needed).
 */
public Optional<StreamDataOutput> getStreamDataOutput(OrcColumnId columnId) {
    checkArgument(closed);
    if (booleanOutputStream == null) {
        return Optional.empty();
    }
    StreamDataOutput booleanData = booleanOutputStream.getStreamDataOutput(columnId);
    // relabel the DATA stream produced by the boolean output stream as a PRESENT stream
    Stream presentStream = new Stream(columnId, PRESENT, toIntExact(booleanData.size()), booleanData.getStream().isUseVInts());
    return Optional.of(new StreamDataOutput(
            sliceOutput -> {
                booleanData.writeData(sliceOutput);
                return presentStream.getLength();
            },
            presentStream));
}
Also used : BooleanStreamCheckpoint(io.prestosql.orc.checkpoint.BooleanStreamCheckpoint) PRESENT(io.prestosql.orc.metadata.Stream.StreamKind.PRESENT) OrcColumnId(io.prestosql.orc.metadata.OrcColumnId) Stream(io.prestosql.orc.metadata.Stream) OrcOutputBuffer(io.prestosql.orc.OrcOutputBuffer) ArrayList(java.util.ArrayList) Preconditions.checkState(com.google.common.base.Preconditions.checkState) List(java.util.List) Preconditions.checkArgument(com.google.common.base.Preconditions.checkArgument) ClassLayout(org.openjdk.jol.info.ClassLayout) CompressionKind(io.prestosql.orc.metadata.CompressionKind) Optional(java.util.Optional) Math.toIntExact(java.lang.Math.toIntExact) Nullable(javax.annotation.Nullable) Stream(io.prestosql.orc.metadata.Stream)

Example 7 with OrcColumnId

Use of io.prestosql.orc.metadata.OrcColumnId in the hetu-core project by openLooKeng.

The following example shows the testRowGroupSkipping method of the TestOrcReaderPositions class.

@Test
public void testRowGroupSkipping() throws Exception {
    try (TempFile tempFile = new TempFile()) {
        // single-stripe file large enough to contain several row groups
        int totalRows = 142_000;
        createSequentialFile(tempFile.getFile(), totalRows);
        // predicate selecting only the two row groups in the middle of the file
        // whose minimum values are 50_000 and 60_000
        OrcPredicate predicate = (numberOfRows, allColumnStatistics) -> {
            if (numberOfRows == totalRows) {
                // file-level statistics: keep the whole file under consideration
                return true;
            }
            IntegerStatistics columnStats = allColumnStatistics.get(new OrcColumnId(1)).getIntegerStatistics();
            return columnStats.getMin() == 50_000 || columnStats.getMin() == 60_000;
        };
        try (OrcRecordReader reader = createCustomOrcRecordReader(tempFile, predicate, BIGINT, MAX_BATCH_SIZE)) {
            assertEquals(reader.getFileRowCount(), totalRows);
            assertEquals(reader.getReaderRowCount(), totalRows);
            assertEquals(reader.getFilePosition(), 0);
            assertEquals(reader.getReaderPosition(), 0);
            // values are sequential, so each row's value equals its absolute position
            long expectedPosition = 50_000;
            for (Page page = reader.nextPage(); page != null; page = reader.nextPage()) {
                page = page.getLoadedPage();
                Block column = page.getBlock(0);
                for (int row = 0; row < column.getPositionCount(); row++) {
                    assertEquals(BIGINT.getLong(column, row), expectedPosition + row);
                }
                assertEquals(reader.getFilePosition(), expectedPosition);
                assertEquals(reader.getReaderPosition(), expectedPosition);
                expectedPosition += page.getPositionCount();
            }
            // exactly the two selected row groups (50_000..69_999) were read
            assertEquals(expectedPosition, 70_000);
            assertEquals(reader.getFilePosition(), totalRows);
            assertEquals(reader.getReaderPosition(), totalRows);
        }
    }
}
Also used : Footer(io.prestosql.orc.metadata.Footer) IntegerStatistics(io.prestosql.orc.metadata.statistics.IntegerStatistics) OrcFile(org.apache.hadoop.hive.ql.io.orc.OrcFile) Slice(io.airlift.slice.Slice) Assert.assertNull(org.testng.Assert.assertNull) MEGABYTE(io.airlift.units.DataSize.Unit.MEGABYTE) Assert.assertEquals(org.testng.Assert.assertEquals) Writable(org.apache.hadoop.io.Writable) Test(org.testng.annotations.Test) OrcTester.createOrcRecordWriter(io.prestosql.orc.OrcTester.createOrcRecordWriter) OrcColumnId(io.prestosql.orc.metadata.OrcColumnId) OrcTester.createSettableStructObjectInspector(io.prestosql.orc.OrcTester.createSettableStructObjectInspector) OrcSerde(org.apache.hadoop.hive.ql.io.orc.OrcSerde) ByteBuffer(java.nio.ByteBuffer) MAX_BLOCK_SIZE(io.prestosql.orc.OrcTester.MAX_BLOCK_SIZE) Writer(org.apache.hadoop.hive.ql.io.orc.Writer) INITIAL_BATCH_SIZE(io.prestosql.orc.OrcReader.INITIAL_BATCH_SIZE) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) MAX_BATCH_SIZE(io.prestosql.orc.OrcReader.MAX_BATCH_SIZE) ORC_12(io.prestosql.orc.OrcTester.Format.ORC_12) VARCHAR(io.prestosql.spi.type.VarcharType.VARCHAR) OrcTester.createCustomOrcRecordReader(io.prestosql.orc.OrcTester.createCustomOrcRecordReader) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) BATCH_SIZE_GROWTH_FACTOR(io.prestosql.orc.OrcReader.BATCH_SIZE_GROWTH_FACTOR) BIGINT(io.prestosql.spi.type.BigintType.BIGINT) Block(io.prestosql.spi.block.Block) NullMemoryManager(org.apache.orc.NullMemoryManager) ImmutableMap(com.google.common.collect.ImmutableMap) UTF_8(java.nio.charset.StandardCharsets.UTF_8) SNAPPY(org.apache.hadoop.hive.ql.io.orc.CompressionKind.SNAPPY) Assert.fail(org.testng.Assert.fail) Page(io.prestosql.spi.Page) IOException(java.io.IOException) Math.min(java.lang.Math.min) Field(java.lang.reflect.Field) Maps(com.google.common.collect.Maps) File(java.io.File) 
SettableStructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.SettableStructObjectInspector) DataSize(io.airlift.units.DataSize) Serializer(org.apache.hadoop.hive.serde2.Serializer) CompressionKind(io.prestosql.orc.metadata.CompressionKind) Assert.assertTrue(org.testng.Assert.assertTrue) OrcOutputFormat(org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat) SerDeException(org.apache.hadoop.hive.serde2.SerDeException) StructField(org.apache.hadoop.hive.serde2.objectinspector.StructField) OrcColumnId(io.prestosql.orc.metadata.OrcColumnId) Block(io.prestosql.spi.block.Block) Page(io.prestosql.spi.Page) OrcTester.createCustomOrcRecordReader(io.prestosql.orc.OrcTester.createCustomOrcRecordReader) IntegerStatistics(io.prestosql.orc.metadata.statistics.IntegerStatistics) Test(org.testng.annotations.Test)

Example 8 with OrcColumnId

Use of io.prestosql.orc.metadata.OrcColumnId in the hetu-core project by openLooKeng.

The following example shows the testStripeSkipping method of the TestOrcReaderPositions class.

@Test
public void testStripeSkipping() throws Exception {
    try (TempFile tempFile = new TempFile()) {
        createMultiStripeFile(tempFile.getFile());
        // predicate selecting only the second and fourth stripes
        OrcPredicate predicate = (numberOfRows, allColumnStatistics) -> {
            if (numberOfRows == 100) {
                // file-level statistics: keep the whole file under consideration
                return true;
            }
            IntegerStatistics columnStats = allColumnStatistics.get(new OrcColumnId(1)).getIntegerStatistics();
            boolean secondStripe = columnStats.getMin() == 60 && columnStats.getMax() == 117;
            boolean fourthStripe = columnStats.getMin() == 180 && columnStats.getMax() == 237;
            return secondStripe || fourthStripe;
        };
        try (OrcRecordReader reader = createCustomOrcRecordReader(tempFile, predicate, BIGINT, MAX_BATCH_SIZE)) {
            assertEquals(reader.getFileRowCount(), 100);
            assertEquals(reader.getReaderRowCount(), 40);
            assertEquals(reader.getFilePosition(), 0);
            assertEquals(reader.getReaderPosition(), 0);
            // second stripe
            Page secondStripePage = reader.nextPage().getLoadedPage();
            assertEquals(secondStripePage.getPositionCount(), 20);
            assertEquals(reader.getReaderPosition(), 0);
            assertEquals(reader.getFilePosition(), 20);
            assertCurrentBatch(secondStripePage, 1);
            // fourth stripe
            Page fourthStripePage = reader.nextPage().getLoadedPage();
            assertEquals(fourthStripePage.getPositionCount(), 20);
            assertEquals(reader.getReaderPosition(), 20);
            assertEquals(reader.getFilePosition(), 60);
            assertCurrentBatch(fourthStripePage, 3);
            // end of selected data
            Page finalPage = reader.nextPage();
            assertNull(finalPage);
            assertEquals(reader.getReaderPosition(), 40);
            assertEquals(reader.getFilePosition(), 100);
        }
    }
}
Also used : Footer(io.prestosql.orc.metadata.Footer) IntegerStatistics(io.prestosql.orc.metadata.statistics.IntegerStatistics) OrcFile(org.apache.hadoop.hive.ql.io.orc.OrcFile) Slice(io.airlift.slice.Slice) Assert.assertNull(org.testng.Assert.assertNull) MEGABYTE(io.airlift.units.DataSize.Unit.MEGABYTE) Assert.assertEquals(org.testng.Assert.assertEquals) Writable(org.apache.hadoop.io.Writable) Test(org.testng.annotations.Test) OrcTester.createOrcRecordWriter(io.prestosql.orc.OrcTester.createOrcRecordWriter) OrcColumnId(io.prestosql.orc.metadata.OrcColumnId) OrcTester.createSettableStructObjectInspector(io.prestosql.orc.OrcTester.createSettableStructObjectInspector) OrcSerde(org.apache.hadoop.hive.ql.io.orc.OrcSerde) ByteBuffer(java.nio.ByteBuffer) MAX_BLOCK_SIZE(io.prestosql.orc.OrcTester.MAX_BLOCK_SIZE) Writer(org.apache.hadoop.hive.ql.io.orc.Writer) INITIAL_BATCH_SIZE(io.prestosql.orc.OrcReader.INITIAL_BATCH_SIZE) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) MAX_BATCH_SIZE(io.prestosql.orc.OrcReader.MAX_BATCH_SIZE) ORC_12(io.prestosql.orc.OrcTester.Format.ORC_12) VARCHAR(io.prestosql.spi.type.VarcharType.VARCHAR) OrcTester.createCustomOrcRecordReader(io.prestosql.orc.OrcTester.createCustomOrcRecordReader) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) BATCH_SIZE_GROWTH_FACTOR(io.prestosql.orc.OrcReader.BATCH_SIZE_GROWTH_FACTOR) BIGINT(io.prestosql.spi.type.BigintType.BIGINT) Block(io.prestosql.spi.block.Block) NullMemoryManager(org.apache.orc.NullMemoryManager) ImmutableMap(com.google.common.collect.ImmutableMap) UTF_8(java.nio.charset.StandardCharsets.UTF_8) SNAPPY(org.apache.hadoop.hive.ql.io.orc.CompressionKind.SNAPPY) Assert.fail(org.testng.Assert.fail) Page(io.prestosql.spi.Page) IOException(java.io.IOException) Math.min(java.lang.Math.min) Field(java.lang.reflect.Field) Maps(com.google.common.collect.Maps) File(java.io.File) 
SettableStructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.SettableStructObjectInspector) DataSize(io.airlift.units.DataSize) Serializer(org.apache.hadoop.hive.serde2.Serializer) CompressionKind(io.prestosql.orc.metadata.CompressionKind) Assert.assertTrue(org.testng.Assert.assertTrue) OrcOutputFormat(org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat) SerDeException(org.apache.hadoop.hive.serde2.SerDeException) StructField(org.apache.hadoop.hive.serde2.objectinspector.StructField) OrcColumnId(io.prestosql.orc.metadata.OrcColumnId) Page(io.prestosql.spi.Page) OrcTester.createCustomOrcRecordReader(io.prestosql.orc.OrcTester.createCustomOrcRecordReader) IntegerStatistics(io.prestosql.orc.metadata.statistics.IntegerStatistics) Test(org.testng.annotations.Test)

Example 9 with OrcColumnId

Use of io.prestosql.orc.metadata.OrcColumnId in the hetu-core project by openLooKeng.

The following example shows the testWriteValue method of the AbstractTestValueStream class.

/**
 * Writes the given groups of values through the output stream, then verifies
 * the encoded stream metadata, sequential read-back, and checkpoint seeking.
 */
protected void testWriteValue(List<List<T>> groups) throws IOException {
    W outputStream = createValueOutputStream();
    // run the full write/read cycle several times to verify reset() restores a clean state
    for (int round = 0; round < 3; round++) {
        outputStream.reset();
        long previousRetainedBytes = 0;
        for (List<T> group : groups) {
            outputStream.recordCheckpoint();
            for (T value : group) {
                writeValue(outputStream, value);
            }
            // retained size must never shrink while writing
            assertTrue(outputStream.getRetainedBytes() >= previousRetainedBytes);
            previousRetainedBytes = outputStream.getRetainedBytes();
        }
        outputStream.close();
        DynamicSliceOutput sliceOutput = new DynamicSliceOutput(1000);
        StreamDataOutput streamDataOutput = outputStream.getStreamDataOutput(new OrcColumnId(33));
        streamDataOutput.writeData(sliceOutput);
        Stream stream = streamDataOutput.getStream();
        assertEquals(stream.getStreamKind(), StreamKind.DATA);
        assertEquals(stream.getColumnId(), new OrcColumnId(33));
        assertEquals(stream.getLength(), sliceOutput.size());
        List<C> checkpoints = outputStream.getCheckpoints();
        assertEquals(checkpoints.size(), groups.size());
        // sequential read-back of every value
        R valueStream = createValueStream(sliceOutput.slice());
        for (List<T> group : groups) {
            int position = 0;
            for (T expected : group) {
                position++;
                T actual = readValue(valueStream);
                if (!actual.equals(expected)) {
                    // guarded assert keeps the failure message cheap on the happy path
                    assertEquals(actual, expected, "index=" + position);
                }
            }
        }
        // random access: seek to each checkpoint in reverse order and re-read its group
        for (int groupIndex = groups.size() - 1; groupIndex >= 0; groupIndex--) {
            valueStream.seekToCheckpoint(checkpoints.get(groupIndex));
            for (T expected : groups.get(groupIndex)) {
                T actual = readValue(valueStream);
                if (!actual.equals(expected)) {
                    assertEquals(actual, expected);
                }
            }
        }
    }
}
Also used : OrcColumnId(io.prestosql.orc.metadata.OrcColumnId) DynamicSliceOutput(io.airlift.slice.DynamicSliceOutput) Stream(io.prestosql.orc.metadata.Stream) StreamCheckpoint(io.prestosql.orc.checkpoint.StreamCheckpoint)

Example 10 with OrcColumnId

Use of io.prestosql.orc.metadata.OrcColumnId in the hetu-core project by openLooKeng.

The following example shows the createColumnWriter method of the ColumnWriters class.

/**
 * Creates a {@link ColumnWriter} for the ORC column identified by {@code columnId},
 * recursively building writers for the children of LIST, MAP, and STRUCT columns.
 *
 * @param columnId ORC column to create a writer for
 * @param orcTypes metadata for all columns in the file
 * @param type engine type corresponding to the ORC column
 * @param compression compression kind applied to the column's streams
 * @param bufferSize output buffer size in bytes
 * @param stringStatisticsLimit maximum size of collected string statistics
 * @throws IllegalArgumentException if the ORC type kind is not supported
 */
public static ColumnWriter createColumnWriter(OrcColumnId columnId, ColumnMetadata<OrcType> orcTypes, Type type, CompressionKind compression, int bufferSize, DataSize stringStatisticsLimit) {
    // fail fast on all reference parameters, not just type
    requireNonNull(columnId, "columnId is null");
    requireNonNull(orcTypes, "orcTypes is null");
    requireNonNull(type, "type is null");
    requireNonNull(compression, "compression is null");
    requireNonNull(stringStatisticsLimit, "stringStatisticsLimit is null");
    OrcType orcType = orcTypes.get(columnId);
    switch(orcType.getOrcTypeKind()) {
        case BOOLEAN:
            return new BooleanColumnWriter(columnId, type, compression, bufferSize);
        case FLOAT:
            return new FloatColumnWriter(columnId, type, compression, bufferSize);
        case DOUBLE:
            return new DoubleColumnWriter(columnId, type, compression, bufferSize);
        case BYTE:
            return new ByteColumnWriter(columnId, type, compression, bufferSize);
        case DATE:
            // dates are stored as longs but collect date-specific statistics
            return new LongColumnWriter(columnId, type, compression, bufferSize, DateStatisticsBuilder::new);
        case SHORT:
        case INT:
        case LONG:
            return new LongColumnWriter(columnId, type, compression, bufferSize, IntegerStatisticsBuilder::new);
        case DECIMAL:
            return new DecimalColumnWriter(columnId, type, compression, bufferSize);
        case TIMESTAMP:
            return new TimestampColumnWriter(columnId, type, compression, bufferSize);
        case BINARY:
            return new SliceDirectColumnWriter(columnId, type, compression, bufferSize, BinaryStatisticsBuilder::new);
        case CHAR:
        case VARCHAR:
        case STRING:
            // string-like types are dictionary encoded with bounded statistics
            return new SliceDictionaryColumnWriter(columnId, type, compression, bufferSize, stringStatisticsLimit);
        case LIST:
            {
                OrcColumnId fieldColumnIndex = orcType.getFieldTypeIndex(0);
                Type fieldType = type.getTypeParameters().get(0);
                ColumnWriter elementWriter = createColumnWriter(fieldColumnIndex, orcTypes, fieldType, compression, bufferSize, stringStatisticsLimit);
                return new ListColumnWriter(columnId, compression, bufferSize, elementWriter);
            }
        case MAP:
            {
                ColumnWriter keyWriter = createColumnWriter(orcType.getFieldTypeIndex(0), orcTypes, type.getTypeParameters().get(0), compression, bufferSize, stringStatisticsLimit);
                ColumnWriter valueWriter = createColumnWriter(orcType.getFieldTypeIndex(1), orcTypes, type.getTypeParameters().get(1), compression, bufferSize, stringStatisticsLimit);
                return new MapColumnWriter(columnId, compression, bufferSize, keyWriter, valueWriter);
            }
        case STRUCT:
            {
                ImmutableList.Builder<ColumnWriter> fieldWriters = ImmutableList.builder();
                for (int fieldId = 0; fieldId < orcType.getFieldCount(); fieldId++) {
                    OrcColumnId fieldColumnIndex = orcType.getFieldTypeIndex(fieldId);
                    Type fieldType = type.getTypeParameters().get(fieldId);
                    fieldWriters.add(createColumnWriter(fieldColumnIndex, orcTypes, fieldType, compression, bufferSize, stringStatisticsLimit));
                }
                return new StructColumnWriter(columnId, compression, bufferSize, fieldWriters.build());
            }
    }
    throw new IllegalArgumentException("Unsupported type: " + type);
}
Also used : DateStatisticsBuilder(io.prestosql.orc.metadata.statistics.DateStatisticsBuilder) BinaryStatisticsBuilder(io.prestosql.orc.metadata.statistics.BinaryStatisticsBuilder) OrcColumnId(io.prestosql.orc.metadata.OrcColumnId) BinaryStatisticsBuilder(io.prestosql.orc.metadata.statistics.BinaryStatisticsBuilder) IntegerStatisticsBuilder(io.prestosql.orc.metadata.statistics.IntegerStatisticsBuilder) DateStatisticsBuilder(io.prestosql.orc.metadata.statistics.DateStatisticsBuilder) IntegerStatisticsBuilder(io.prestosql.orc.metadata.statistics.IntegerStatisticsBuilder) OrcType(io.prestosql.orc.metadata.OrcType) Type(io.prestosql.spi.type.Type) OrcType(io.prestosql.orc.metadata.OrcType)

Aggregations

OrcColumnId (io.prestosql.orc.metadata.OrcColumnId)23 Stream (io.prestosql.orc.metadata.Stream)9 ImmutableMap (com.google.common.collect.ImmutableMap)8 ColumnStatistics (io.prestosql.orc.metadata.statistics.ColumnStatistics)7 Test (org.testng.annotations.Test)7 ArrayList (java.util.ArrayList)6 List (java.util.List)6 Slice (io.airlift.slice.Slice)5 ImmutableList (com.google.common.collect.ImmutableList)4 CompressionKind (io.prestosql.orc.metadata.CompressionKind)4 ColumnReader (io.prestosql.orc.reader.ColumnReader)4 DateColumnReader (io.prestosql.orc.reader.DateColumnReader)4 IntegerColumnReader (io.prestosql.orc.reader.IntegerColumnReader)4 LongColumnReader (io.prestosql.orc.reader.LongColumnReader)4 ShortColumnReader (io.prestosql.orc.reader.ShortColumnReader)4 OrcInputStream (io.prestosql.orc.stream.OrcInputStream)4 InputStream (java.io.InputStream)4 Map (java.util.Map)4 ColumnEncoding (io.prestosql.orc.metadata.ColumnEncoding)3 StripeFooter (io.prestosql.orc.metadata.StripeFooter)3