Search in sources:

Example 6 with OrcMetadataReader

use of com.facebook.presto.orc.metadata.OrcMetadataReader in project presto by prestodb.

From the class TestCachingOrcDataSource, method doIntegration.

/**
 * Reads column 0 (VARCHAR) of the given data source end to end through an {@link OrcReader}
 * and verifies the footer read count, the minimum stripe count, the caching wrapper type and
 * the total number of positions read.
 *
 * @param orcDataSource data source under test; its read counter is inspected
 * @param maxMergeDistance merge distance handed to the reader and to the cache wrapper
 * @param maxReadSize maximum read size handed to the reader and to the cache wrapper
 * @throws IOException if reading from the data source fails
 */
public void doIntegration(TestingOrcDataSource orcDataSource, DataSize maxMergeDistance, DataSize maxReadSize) throws IOException {
    OrcReader reader = new OrcReader(orcDataSource, new OrcMetadataReader(), maxMergeDistance, maxReadSize);

    // Only the file footer should have been read so far.
    assertEquals(orcDataSource.getReadCount(), 1);

    List<StripeInformation> stripeList = reader.getFooter().getStripes();
    // Sanity check: the ORC writer's low memory mode may produce more than three stripes.
    assertGreaterThanOrEqual(stripeList.size(), 3);

    // The data source must come back wrapped in a CachingOrcDataSource.
    assertInstanceOf(
            wrapWithCacheIfTinyStripes(orcDataSource, stripeList, maxMergeDistance, maxReadSize),
            CachingOrcDataSource.class);

    OrcRecordReader recordReader = reader.createRecordReader(
            ImmutableMap.of(0, VARCHAR),
            (numberOfRows, statisticsByColumnIndex) -> true,
            HIVE_STORAGE_TIME_ZONE,
            new AggregatedMemoryContext());

    // Drain every batch, summing the positions of the VARCHAR column.
    int totalPositions = 0;
    for (int batchSize = recordReader.nextBatch(); batchSize > 0; batchSize = recordReader.nextBatch()) {
        Block varcharBlock = recordReader.readBlock(VARCHAR, 0);
        totalPositions += varcharBlock.getPositionCount();
    }
    assertEquals(totalPositions, POSITION_COUNT);
}
Also used : OrcMetadataReader(com.facebook.presto.orc.metadata.OrcMetadataReader) Block(com.facebook.presto.spi.block.Block) AggregatedMemoryContext(com.facebook.presto.orc.memory.AggregatedMemoryContext) StripeInformation(com.facebook.presto.orc.metadata.StripeInformation)

Example 7 with OrcMetadataReader

use of com.facebook.presto.orc.metadata.OrcMetadataReader in project presto by prestodb.

From the class TestOrcBloomFilters, method testOrcHiveBloomFilterSerde.

/**
 * Round-trips a Hive bloom filter through the ORC bloom filter index encoding and checks the
 * result twice: once via {@link OrcMetadataReader} and once by parsing the protobuf directly.
 */
@Test
public void testOrcHiveBloomFilterSerde() throws Exception {
    BloomFilter written = new BloomFilter(1000L, 0.05);
    written.addString(TEST_STRING);
    assertTrue(written.testString(TEST_STRING));

    // Build the protobuf form of the filter and serialize it into an (initially empty) index.
    OrcProto.BloomFilter protoFilter = OrcProto.BloomFilter.newBuilder()
            .addAllBitset(Longs.asList(written.getBitSet()))
            .setNumHashFunctions(written.getNumHashFunctions())
            .build();
    OrcProto.BloomFilterIndex emptyIndex = OrcProto.BloomFilterIndex.getDefaultInstance();
    byte[] serialized = serializeBloomFilterToIndex(protoFilter, emptyIndex);

    // Path 1: read back through the metadata reader.
    InputStream serializedStream = new ByteArrayInputStream(serialized);
    List<HiveBloomFilter> decodedFilters = new OrcMetadataReader().readBloomFilterIndexes(serializedStream);
    assertEquals(decodedFilters.size(), 1);
    assertTrue(decodedFilters.get(0).testString(TEST_STRING));
    assertFalse(decodedFilters.get(0).testString(TEST_STRING_NOT_WRITTEN));
    assertEquals(written.getBitSize(), decodedFilters.get(0).getBitSize());
    assertEquals(written.getNumHashFunctions(), decodedFilters.get(0).getNumHashFunctions());
    // The raw bit sets must match exactly.
    assertTrue(Arrays.equals(decodedFilters.get(0).getBitSet(), written.getBitSet()));

    // Path 2: parse the protobuf directly, which allows closer inspection of the bit sets.
    OrcProto.BloomFilterIndex parsedIndex = OrcProto.BloomFilterIndex.parseFrom(CodedInputStream.newInstance(serialized));
    List<OrcProto.BloomFilter> parsedFilters = parsedIndex.getBloomFilterList();
    assertEquals(parsedFilters.size(), 1);
    OrcProto.BloomFilter parsed = parsedFilters.get(0);
    // Contents of the ORC bloom filter bit set.
    assertTrue(Arrays.equals(Longs.toArray(parsed.getBitsetList()), written.getBitSet()));
    // Number of hash functions.
    assertEquals(written.getNumHashFunctions(), parsed.getNumHashFunctions());
    // Number of longs backing the bit set.
    assertEquals(written.getBitSet().length, parsed.getBitsetCount());
}
Also used : ByteArrayInputStream(java.io.ByteArrayInputStream) CodedInputStream(com.facebook.presto.orc.protobuf.CodedInputStream) InputStream(java.io.InputStream) CodedInputStream(com.facebook.presto.orc.protobuf.CodedInputStream) OrcProto(com.facebook.presto.orc.proto.OrcProto) OrcMetadataReader(com.facebook.presto.orc.metadata.OrcMetadataReader) HiveBloomFilter(com.facebook.presto.orc.metadata.HiveBloomFilter) BloomFilter(org.apache.hive.common.util.BloomFilter) TupleDomainOrcPredicate.checkInBloomFilter(com.facebook.presto.orc.TupleDomainOrcPredicate.checkInBloomFilter) ByteArrayInputStream(java.io.ByteArrayInputStream) HiveBloomFilter(com.facebook.presto.orc.metadata.HiveBloomFilter) Test(org.testng.annotations.Test)

Example 8 with OrcMetadataReader

use of com.facebook.presto.orc.metadata.OrcMetadataReader in project presto by prestodb.

From the class TestOrcReaderPositions, method testReadUserMetadata.

/**
 * Writes an ORC file that carries only user metadata, reads its footer back and checks that
 * the metadata map survives the round trip.
 */
@Test
public void testReadUserMetadata() throws Exception {
    try (TempFile tempFile = new TempFile()) {
        Map<String, String> metadata = ImmutableMap.of("a", "ala", "b", "ma", "c", "kota");
        createFileWithOnlyUserMetadata(tempFile.getFile(), metadata);

        DataSize oneMegabyte = new DataSize(1, DataSize.Unit.MEGABYTE);
        // Close the data source before the temp file is cleaned up so the file handle is not leaked.
        try (OrcDataSource orcDataSource = new FileOrcDataSource(tempFile.getFile(), oneMegabyte, oneMegabyte, oneMegabyte)) {
            OrcReader orcReader = new OrcReader(orcDataSource, new OrcMetadataReader(), oneMegabyte, oneMegabyte);
            Footer footer = orcReader.getFooter();
            // Footer values are Slices; view them as strings for comparison with the input map.
            Map<String, String> readMetadata = Maps.transformValues(footer.getUserMetadata(), Slice::toStringAscii);
            assertEquals(readMetadata, metadata);
        }
    }
}
Also used : TempFile(com.facebook.presto.orc.OrcTester.TempFile) Slice(io.airlift.slice.Slice) DataSize(io.airlift.units.DataSize) OrcMetadataReader(com.facebook.presto.orc.metadata.OrcMetadataReader) Footer(com.facebook.presto.orc.metadata.Footer) Test(org.testng.annotations.Test)

Example 9 with OrcMetadataReader

use of com.facebook.presto.orc.metadata.OrcMetadataReader in project presto by prestodb.

From the class TestOrcReaderPositions, method testRowGroupSkipping.

/**
 * Verifies that a predicate selecting only the row groups whose minimum value is 50,000 or
 * 60,000 makes the reader return exactly rows 50,000 up to (but not including) 70,000, and
 * that file and reader positions track the rows actually returned.
 */
@Test
public void testRowGroupSkipping() throws Exception {
    try (TempFile tempFile = new TempFile()) {
        // create single stripe file with multiple row groups
        int rowCount = 142_000;
        createSequentialFile(tempFile.getFile(), rowCount);

        // test reading two row groups from middle of file
        OrcPredicate predicate = (numberOfRows, statisticsByColumnIndex) -> {
            // A row count equal to the whole file is accepted unconditionally,
            // so only smaller units are filtered by their statistics.
            if (numberOfRows == rowCount) {
                return true;
            }
            IntegerStatistics stats = statisticsByColumnIndex.get(0).getIntegerStatistics();
            return (stats.getMin() == 50_000) || (stats.getMin() == 60_000);
        };

        // try-with-resources guarantees the reader is closed even when an assertion fails.
        try (OrcRecordReader reader = createCustomOrcRecordReader(tempFile, new OrcMetadataReader(), predicate, BIGINT)) {
            assertEquals(reader.getFileRowCount(), rowCount);
            assertEquals(reader.getReaderRowCount(), rowCount);
            assertEquals(reader.getFilePosition(), 0);
            assertEquals(reader.getReaderPosition(), 0);

            long position = 50_000;
            while (true) {
                int batchSize = reader.nextBatch();
                if (batchSize == -1) {
                    break;
                }

                Block block = reader.readBlock(BIGINT, 0);
                for (int i = 0; i < batchSize; i++) {
                    assertEquals(BIGINT.getLong(block, i), position + i);
                }

                assertEquals(reader.getFilePosition(), position);
                assertEquals(reader.getReaderPosition(), position);
                position += batchSize;
            }

            // Both selected row groups were consumed; positions then jump to end of file.
            assertEquals(position, 70_000);
            assertEquals(reader.getFilePosition(), rowCount);
            assertEquals(reader.getReaderPosition(), rowCount);
        }
    }
}
Also used : OrcFile(org.apache.hadoop.hive.ql.io.orc.OrcFile) Block(com.facebook.presto.spi.block.Block) Slice(io.airlift.slice.Slice) OrcWriterOptions(org.apache.hadoop.hive.ql.io.orc.OrcWriterOptions) PrimitiveObjectInspectorFactory.javaLongObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaLongObjectInspector) Assert.assertEquals(org.testng.Assert.assertEquals) Writable(org.apache.hadoop.io.Writable) Test(org.testng.annotations.Test) ORC_12(com.facebook.presto.orc.OrcTester.Format.ORC_12) OrcSerde(org.apache.hadoop.hive.ql.io.orc.OrcSerde) ByteBuffer(java.nio.ByteBuffer) Writer(org.apache.hadoop.hive.ql.io.orc.Writer) BIGINT(com.facebook.presto.spi.type.BigintType.BIGINT) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) IntegerStatistics(com.facebook.presto.orc.metadata.IntegerStatistics) TempFile(com.facebook.presto.orc.OrcTester.TempFile) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) OrcTester.createOrcRecordWriter(com.facebook.presto.orc.OrcTester.createOrcRecordWriter) Path(org.apache.hadoop.fs.Path) ImmutableMap(com.google.common.collect.ImmutableMap) Footer(com.facebook.presto.orc.metadata.Footer) UTF_8(java.nio.charset.StandardCharsets.UTF_8) SNAPPY(org.apache.hadoop.hive.ql.io.orc.CompressionKind.SNAPPY) NullMemoryManager(org.apache.hadoop.hive.ql.io.orc.NullMemoryManager) OrcMetadataReader(com.facebook.presto.orc.metadata.OrcMetadataReader) IOException(java.io.IOException) Field(java.lang.reflect.Field) Maps(com.google.common.collect.Maps) File(java.io.File) SettableStructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.SettableStructObjectInspector) DataSize(io.airlift.units.DataSize) OrcTester.createCustomOrcRecordReader(com.facebook.presto.orc.OrcTester.createCustomOrcRecordReader) Serializer(org.apache.hadoop.hive.serde2.Serializer) OrcOutputFormat(org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat) 
SerDeException(org.apache.hadoop.hive.serde2.SerDeException) OrcTester.createSettableStructObjectInspector(com.facebook.presto.orc.OrcTester.createSettableStructObjectInspector) StructField(org.apache.hadoop.hive.serde2.objectinspector.StructField) TempFile(com.facebook.presto.orc.OrcTester.TempFile) OrcMetadataReader(com.facebook.presto.orc.metadata.OrcMetadataReader) Block(com.facebook.presto.spi.block.Block) OrcTester.createCustomOrcRecordReader(com.facebook.presto.orc.OrcTester.createCustomOrcRecordReader) IntegerStatistics(com.facebook.presto.orc.metadata.IntegerStatistics) Test(org.testng.annotations.Test)

Example 10 with OrcMetadataReader

use of com.facebook.presto.orc.metadata.OrcMetadataReader in project presto by prestodb.

From the class OrcStorageManager, method computeShardStats.

/**
 * Opens the given ORC file with the default reader attributes and collects statistics for
 * every column for which {@code computeColumnStats} yields a value.
 *
 * @param file the ORC file to inspect
 * @return the collected column statistics, in column-info order
 * @throws PrestoException with {@code RAPTOR_ERROR} if the file cannot be read
 */
private List<ColumnStats> computeShardStats(File file) {
    try (OrcDataSource source = fileOrcDataSource(defaultReaderAttributes, file)) {
        OrcReader orcReader = new OrcReader(
                source,
                new OrcMetadataReader(),
                defaultReaderAttributes.getMaxMergeDistance(),
                defaultReaderAttributes.getMaxReadSize());

        ImmutableList.Builder<ColumnStats> collected = ImmutableList.builder();
        for (ColumnInfo column : getColumnInfo(orcReader)) {
            // Columns without computable stats are simply omitted from the result.
            computeColumnStats(orcReader, column.getColumnId(), column.getType())
                    .ifPresent(stats -> collected.add(stats));
        }
        return collected.build();
    }
    catch (IOException e) {
        throw new PrestoException(RAPTOR_ERROR, "Failed to read file: " + file, e);
    }
}
Also used : FileOrcDataSource(com.facebook.presto.orc.FileOrcDataSource) OrcDataSource(com.facebook.presto.orc.OrcDataSource) OrcReader(com.facebook.presto.orc.OrcReader) ImmutableList(com.google.common.collect.ImmutableList) ColumnStats(com.facebook.presto.raptor.metadata.ColumnStats) ShardStats.computeColumnStats(com.facebook.presto.raptor.storage.ShardStats.computeColumnStats) OrcMetadataReader(com.facebook.presto.orc.metadata.OrcMetadataReader) ColumnInfo(com.facebook.presto.raptor.metadata.ColumnInfo) PrestoException(com.facebook.presto.spi.PrestoException) IOException(java.io.IOException)

Aggregations

OrcMetadataReader (com.facebook.presto.orc.metadata.OrcMetadataReader)11 DataSize (io.airlift.units.DataSize)5 Test (org.testng.annotations.Test)5 OrcReader (com.facebook.presto.orc.OrcReader)4 TempFile (com.facebook.presto.orc.OrcTester.TempFile)4 IOException (java.io.IOException)4 OrcTester.createCustomOrcRecordReader (com.facebook.presto.orc.OrcTester.createCustomOrcRecordReader)3 Footer (com.facebook.presto.orc.metadata.Footer)3 Block (com.facebook.presto.spi.block.Block)3 ImmutableMap (com.google.common.collect.ImmutableMap)3 Slice (io.airlift.slice.Slice)3 FileOrcDataSource (com.facebook.presto.orc.FileOrcDataSource)2 OrcDataSource (com.facebook.presto.orc.OrcDataSource)2 ORC_12 (com.facebook.presto.orc.OrcTester.Format.ORC_12)2 OrcTester.createOrcRecordWriter (com.facebook.presto.orc.OrcTester.createOrcRecordWriter)2 OrcTester.createSettableStructObjectInspector (com.facebook.presto.orc.OrcTester.createSettableStructObjectInspector)2 AggregatedMemoryContext (com.facebook.presto.orc.memory.AggregatedMemoryContext)2 IntegerStatistics (com.facebook.presto.orc.metadata.IntegerStatistics)2 PrestoException (com.facebook.presto.spi.PrestoException)2 BIGINT (com.facebook.presto.spi.type.BigintType.BIGINT)2