Search in sources :

Example 1 with OrcMetadataReader

use of io.prestosql.orc.metadata.OrcMetadataReader in project hetu-core by openlookeng.

The method readFrom of the class OrcFileTail.

/**
 * Reads and decodes the tail of an ORC file: the postscript, the metadata section and the footer.
 *
 * <p>A block of up to {@code EXPECTED_FOOTER_SIZE} bytes is read from the end of the file first. If the
 * lengths recorded in the postscript show that the tail is larger than that block, the tail is read again
 * with the exact size. When {@code writeValidation} is present, the decoded version, compression,
 * column names, row-group size, user metadata and statistics are checked against it.
 *
 * @param orcDataSource source of the ORC file bytes
 * @param writeValidation optional expectations recorded by a writer, validated against the decoded tail
 * @return a populated {@code OrcFileTail} (postScript, decompressor, metadata and footer are set)
 * @throws OrcCorruptionException if the file is too small, a length stored in the postscript is negative
 *         or larger than the file, the postscript cannot be decoded, or the footer declares no columns
 * @throws IOException declared by the method; presumably propagated from reads on the data source
 */
public static OrcFileTail readFrom(OrcDataSource orcDataSource, Optional<OrcWriteValidation> writeValidation) throws IOException {
    OrcFileTail orcFileTail = new OrcFileTail();
    //
    // Layout of the file tail, in file order:
    //
    // variable: Metadata
    // variable: Footer
    // variable: PostScript - contains length of footer and metadata
    // 1 byte: postScriptSize
    //
    // (this is the order in which the sections are sliced below: metadata first, then footer)

    // figure out the size of the file using the option or filesystem
    long size = orcDataSource.getSize();
    if (size <= PostScript.MAGIC.length()) {
        throw new OrcCorruptionException(orcDataSource.getId(), "Invalid file size %s", size);
    }

    // Read the tail of the file
    int expectedBufferSize = toIntExact(min(size, EXPECTED_FOOTER_SIZE));
    Slice buffer = orcDataSource.readFully(size - expectedBufferSize, expectedBufferSize);

    // get length of PostScript - last byte of the file
    int postScriptSize = buffer.getUnsignedByte(buffer.length() - SIZE_OF_BYTE);
    if (postScriptSize >= buffer.length()) {
        throw new OrcCorruptionException(orcDataSource.getId(), "Invalid postscript length %s", postScriptSize);
    }

    MetadataReader metadataReader = new ExceptionWrappingMetadataReader(orcDataSource.getId(), new OrcMetadataReader());

    // decode the post script
    try {
        orcFileTail.postScript = metadataReader.readPostScript(buffer.slice(buffer.length() - SIZE_OF_BYTE - postScriptSize, postScriptSize).getInput());
    } catch (OrcCorruptionException e) {
        // check if this is an ORC file and not an RCFile or something else
        if (!isValidHeaderMagic(orcDataSource)) {
            throw new OrcCorruptionException(orcDataSource.getId(), "Not an ORC file");
        }
        throw e;
    }

    // verify this is a supported version
    checkOrcVersion(orcDataSource, orcFileTail.postScript.getVersion());
    validateWrite(validation -> validation.getVersion().equals(orcFileTail.postScript.getVersion()), writeValidation, orcDataSource, "Unexpected version");

    int bufferSize = toIntExact(orcFileTail.postScript.getCompressionBlockSize());

    // check compression codec is supported
    CompressionKind compressionKind = orcFileTail.postScript.getCompression();
    orcFileTail.decompressor = OrcDecompressor.createOrcDecompressor(orcDataSource.getId(), compressionKind, bufferSize);
    validateWrite(validation -> validation.getCompression() == compressionKind, writeValidation, orcDataSource, "Unexpected compression");

    PostScript.HiveWriterVersion hiveWriterVersion = orcFileTail.postScript.getHiveWriterVersion();

    int footerSize = toIntExact(orcFileTail.postScript.getFooterLength());
    int metadataSize = toIntExact(orcFileTail.postScript.getMetadataLength());

    // The lengths come straight from the (possibly corrupt) postscript: reject values that would
    // otherwise surface later as a negative slice length or a negative read position.
    if (footerSize < 0) {
        throw new OrcCorruptionException(orcDataSource.getId(), "Invalid footer length %s", footerSize);
    }
    if (metadataSize < 0) {
        throw new OrcCorruptionException(orcDataSource.getId(), "Invalid metadata length %s", metadataSize);
    }

    // Sum in long so that two large int lengths cannot wrap around to a small or negative total.
    long completeFooterLength = (long) footerSize + metadataSize + postScriptSize + SIZE_OF_BYTE;
    if (completeFooterLength > size || completeFooterLength > Integer.MAX_VALUE) {
        throw new OrcCorruptionException(orcDataSource.getId(), "Invalid file tail length %s for file size %s", completeFooterLength, size);
    }
    int completeFooterSize = (int) completeFooterLength;

    // check if extra bytes need to be read
    Slice completeFooterSlice;
    if (completeFooterSize > buffer.length()) {
        // initial read was not large enough, so just read again with the correct size
        completeFooterSlice = orcDataSource.readFully(size - completeFooterSize, completeFooterSize);
    } else {
        // footer is already in the bytes in buffer, just adjust position, length
        completeFooterSlice = buffer.slice(buffer.length() - completeFooterSize, completeFooterSize);
    }

    // read metadata
    Slice metadataSlice = completeFooterSlice.slice(0, metadataSize);
    try (InputStream metadataInputStream = new OrcInputStream(OrcChunkLoader.create(orcDataSource.getId(), metadataSlice, orcFileTail.decompressor, newSimpleAggregatedMemoryContext()))) {
        orcFileTail.metadata = metadataReader.readMetadata(hiveWriterVersion, metadataInputStream);
    }

    // read footer
    Slice footerSlice = completeFooterSlice.slice(metadataSize, footerSize);
    try (InputStream footerInputStream = new OrcInputStream(OrcChunkLoader.create(orcDataSource.getId(), footerSlice, orcFileTail.decompressor, newSimpleAggregatedMemoryContext()))) {
        orcFileTail.footer = metadataReader.readFooter(hiveWriterVersion, footerInputStream);
    }
    if (orcFileTail.footer.getTypes().size() == 0) {
        throw new OrcCorruptionException(orcDataSource.getId(), "File has no columns");
    }

    validateWrite(validation -> validation.getColumnNames().equals(orcFileTail.footer.getTypes().get(new OrcColumnId(0)).getFieldNames()), writeValidation, orcDataSource, "Unexpected column names");
    validateWrite(validation -> validation.getRowGroupMaxRowCount() == orcFileTail.footer.getRowsInRowGroup(), writeValidation, orcDataSource, "Unexpected rows in group");
    if (writeValidation.isPresent()) {
        writeValidation.get().validateMetadata(orcDataSource.getId(), orcFileTail.footer.getUserMetadata());
        writeValidation.get().validateFileStatistics(orcDataSource.getId(), orcFileTail.footer.getFileStats());
        writeValidation.get().validateStripeStatistics(orcDataSource.getId(), orcFileTail.footer.getStripes(), orcFileTail.metadata.getStripeStatsList());
    }
    return orcFileTail;
}
Also used : OrcColumnId(io.prestosql.orc.metadata.OrcColumnId) CompressionKind(io.prestosql.orc.metadata.CompressionKind) OrcInputStream(io.prestosql.orc.stream.OrcInputStream) OrcInputStream(io.prestosql.orc.stream.OrcInputStream) InputStream(java.io.InputStream) OrcMetadataReader(io.prestosql.orc.metadata.OrcMetadataReader) ExceptionWrappingMetadataReader(io.prestosql.orc.metadata.ExceptionWrappingMetadataReader) OrcMetadataReader(io.prestosql.orc.metadata.OrcMetadataReader) MetadataReader(io.prestosql.orc.metadata.MetadataReader) PostScript(io.prestosql.orc.metadata.PostScript) Slice(io.airlift.slice.Slice) ExceptionWrappingMetadataReader(io.prestosql.orc.metadata.ExceptionWrappingMetadataReader)

Example 2 with OrcMetadataReader

use of io.prestosql.orc.metadata.OrcMetadataReader in project hetu-core by openlookeng.

The method testOrcHiveBloomFilterSerde of the class TestOrcBloomFilters.

/**
 * Round-trips a bloom filter through the ORC protobuf representation and checks that the copy decoded by
 * {@code OrcMetadataReader} and the raw protobuf message both match the filter that was written.
 */
@Test
public void testOrcHiveBloomFilterSerde() throws Exception {
    // Source filter holding a single value
    HashableBloomFilter written = new HashableBloomFilter(1000L, 0.05);
    written.add(TEST_STRING);
    assertTrue(written.test(TEST_STRING));
    assertTrue(written.test(wrappedBuffer(TEST_STRING)));

    // Copy bit set and hash function count into the protobuf message, then serialize it into an index
    OrcProto.BloomFilter protoFilter = OrcProto.BloomFilter.newBuilder()
            .addAllBitset(Longs.asList(written.getBitSet()))
            .setNumHashFunctions(written.getNumHashFunctions())
            .build();
    byte[] serialized = serializeBloomFilterToIndex(protoFilter, OrcProto.BloomFilterIndex.getDefaultInstance());

    // Path 1: decode through the metadata reader
    InputStream serializedStream = new ByteArrayInputStream(serialized);
    OrcMetadataReader reader = new OrcMetadataReader();
    List<HashableBloomFilter> decodedFilters = reader.readBloomFilterIndexes(serializedStream);
    assertEquals(decodedFilters.size(), 1);

    HashableBloomFilter decoded = decodedFilters.get(0);
    assertTrue(decoded.test(TEST_STRING));
    assertTrue(decoded.test(wrappedBuffer(TEST_STRING)));
    assertFalse(decoded.test(TEST_STRING_NOT_WRITTEN));
    assertFalse(decoded.test(wrappedBuffer(TEST_STRING_NOT_WRITTEN)));
    assertEquals(written.getNumBits(), decoded.getNumBits());
    assertEquals(written.getNumHashFunctions(), decoded.getNumHashFunctions());
    // bit sets must be identical
    assertTrue(Arrays.equals(decoded.getBitSet(), written.getBitSet()));

    // Path 2: parse the protobuf directly, which allows closer inspection of the bit sets
    OrcProto.BloomFilterIndex parsedIndex = OrcProto.BloomFilterIndex.parseFrom(CodedInputStream.newInstance(serialized));
    List<OrcProto.BloomFilter> parsedFilters = parsedIndex.getBloomFilterList();
    assertEquals(parsedFilters.size(), 1);

    OrcProto.BloomFilter parsed = parsedFilters.get(0);
    // contents of the ORC bloom filter bit set
    assertTrue(Arrays.equals(Longs.toArray(parsed.getBitsetList()), written.getBitSet()));
    // number of hash functions
    assertEquals(written.getNumHashFunctions(), parsed.getNumHashFunctions());
    // number of longs in the bit set
    assertEquals(written.getBitSet().length, parsed.getBitsetCount());
}
Also used : CodedInputStream(io.prestosql.orc.protobuf.CodedInputStream) ByteArrayInputStream(java.io.ByteArrayInputStream) InputStream(java.io.InputStream) CodedInputStream(io.prestosql.orc.protobuf.CodedInputStream) OrcProto(io.prestosql.orc.proto.OrcProto) OrcMetadataReader(io.prestosql.orc.metadata.OrcMetadataReader) HashableBloomFilter(io.prestosql.orc.metadata.statistics.HashableBloomFilter) TupleDomainOrcPredicate.checkInBloomFilter(io.prestosql.orc.TupleDomainOrcPredicate.checkInBloomFilter) HashableBloomFilter(io.prestosql.orc.metadata.statistics.HashableBloomFilter) ByteArrayInputStream(java.io.ByteArrayInputStream) Test(org.testng.annotations.Test)

Example 3 with OrcMetadataReader

use of io.prestosql.orc.metadata.OrcMetadataReader in project hetu-core by openlookeng.

The method testWriteOutputStreamsInOrder of the class TestOrcWriter.

/**
 * For every {@code OrcWriteValidationMode}, writes a single page of five VARCHAR columns to a temp file,
 * then reads each stripe footer back and checks that no index stream appears after a data stream and that
 * data stream lengths never decrease.
 */
@Test
public void testWriteOutputStreamsInOrder() throws IOException {
    for (OrcWriteValidationMode mode : OrcWriteValidationMode.values()) {
        TempFile target = new TempFile();
        OutputStreamOrcDataSink sink = new OutputStreamOrcDataSink(new FileOutputStream(target.getFile()));
        OrcWriter orcWriter = new OrcWriter(
                sink,
                ImmutableList.of("test1", "test2", "test3", "test4", "test5"),
                ImmutableList.of(VARCHAR, VARCHAR, VARCHAR, VARCHAR, VARCHAR),
                NONE,
                new OrcWriterOptions()
                        .withStripeMinSize(new DataSize(0, MEGABYTE))
                        .withStripeMaxSize(new DataSize(32, MEGABYTE))
                        .withStripeMaxRowCount(ORC_STRIPE_SIZE)
                        .withRowGroupMaxRowCount(ORC_ROW_GROUP_SIZE)
                        .withDictionaryMaxMemory(new DataSize(32, MEGABYTE)),
                false,
                ImmutableMap.of(),
                true,
                mode,
                new OrcWriterStats(),
                Optional.empty(),
                Optional.empty());

        // one column per seed string; seeds have different lengths so the streams are unsorted by size
        String[] seeds = new String[] { "a", "bbbbb", "ccc", "dd", "eeee" };
        Block[] columns = new Block[seeds.length];
        int rowCount = 65536;
        BlockBuilder builder = VARCHAR.createBlockBuilder(null, rowCount);
        for (int column = 0; column < seeds.length; column++) {
            byte[] value = seeds[column].getBytes();
            for (int row = 0; row < rowCount; row++) {
                // bump the first byte on every row so consecutive values differ
                value[0] = (byte) ((value[0] + 1) % 128);
                builder.writeBytes(Slices.wrappedBuffer(value, 0, value.length), 0, value.length);
                builder.closeEntry();
            }
            columns[column] = builder.build();
            builder = builder.newBlockBuilderLike(null);
        }
        orcWriter.write(new Page(columns));
        orcWriter.close();

        // reopen the file and inspect the stream order recorded in every stripe footer
        DataSize oneMegabyte = new DataSize(1, MEGABYTE);
        OrcDataSource source = new FileOrcDataSource(target.getFile(), oneMegabyte, oneMegabyte, oneMegabyte, true, target.getFile().lastModified());
        Footer fileFooter = new OrcReader(source, oneMegabyte, oneMegabyte, oneMegabyte).getFooter();
        for (StripeInformation stripe : fileFooter.getStripes()) {
            // the stripe footer is read from the position right after the index and data sections
            long stripeFooterOffset = stripe.getOffset() + stripe.getIndexLength() + stripe.getDataLength();
            Slice stripeFooterBytes = source.readFully(stripeFooterOffset, toIntExact(stripe.getFooterLength()));
            try (InputStream stripeFooterStream = new OrcInputStream(OrcChunkLoader.create(source.getId(), stripeFooterBytes, Optional.empty(), newSimpleAggregatedMemoryContext()))) {
                StripeFooter stripeFooter = new OrcMetadataReader().readStripeFooter(fileFooter.getTypes(), stripeFooterStream, ZoneId.of("UTC"));
                int previousLength = 0;
                boolean sawDataStream = false;
                for (Stream stream : stripeFooter.getStreams()) {
                    if (isIndexStream(stream)) {
                        // an index stream must never follow a data stream
                        assertFalse(sawDataStream);
                    } else {
                        sawDataStream = true;
                        // data streams must be ordered by non-decreasing length
                        assertGreaterThanOrEqual(stream.getLength(), previousLength);
                        previousLength = stream.getLength();
                    }
                }
            }
        }
    }
}
Also used : Page(io.prestosql.spi.Page) DataSize(io.airlift.units.DataSize) OrcWriteValidationMode(io.prestosql.orc.OrcWriteValidation.OrcWriteValidationMode) StripeReader.isIndexStream(io.prestosql.orc.StripeReader.isIndexStream) OrcInputStream(io.prestosql.orc.stream.OrcInputStream) FileOutputStream(java.io.FileOutputStream) Stream(io.prestosql.orc.metadata.Stream) InputStream(java.io.InputStream) BlockBuilder(io.prestosql.spi.block.BlockBuilder) OrcInputStream(io.prestosql.orc.stream.OrcInputStream) OrcInputStream(io.prestosql.orc.stream.OrcInputStream) InputStream(java.io.InputStream) OrcMetadataReader(io.prestosql.orc.metadata.OrcMetadataReader) StripeFooter(io.prestosql.orc.metadata.StripeFooter) Slice(io.airlift.slice.Slice) FileOutputStream(java.io.FileOutputStream) Footer(io.prestosql.orc.metadata.Footer) StripeFooter(io.prestosql.orc.metadata.StripeFooter) Block(io.prestosql.spi.block.Block) StripeInformation(io.prestosql.orc.metadata.StripeInformation) Test(org.testng.annotations.Test)

Aggregations

OrcMetadataReader (io.prestosql.orc.metadata.OrcMetadataReader)3 InputStream (java.io.InputStream)3 Slice (io.airlift.slice.Slice)2 OrcInputStream (io.prestosql.orc.stream.OrcInputStream)2 Test (org.testng.annotations.Test)2 DataSize (io.airlift.units.DataSize)1 OrcWriteValidationMode (io.prestosql.orc.OrcWriteValidation.OrcWriteValidationMode)1 StripeReader.isIndexStream (io.prestosql.orc.StripeReader.isIndexStream)1 TupleDomainOrcPredicate.checkInBloomFilter (io.prestosql.orc.TupleDomainOrcPredicate.checkInBloomFilter)1 CompressionKind (io.prestosql.orc.metadata.CompressionKind)1 ExceptionWrappingMetadataReader (io.prestosql.orc.metadata.ExceptionWrappingMetadataReader)1 Footer (io.prestosql.orc.metadata.Footer)1 MetadataReader (io.prestosql.orc.metadata.MetadataReader)1 OrcColumnId (io.prestosql.orc.metadata.OrcColumnId)1 PostScript (io.prestosql.orc.metadata.PostScript)1 Stream (io.prestosql.orc.metadata.Stream)1 StripeFooter (io.prestosql.orc.metadata.StripeFooter)1 StripeInformation (io.prestosql.orc.metadata.StripeInformation)1 HashableBloomFilter (io.prestosql.orc.metadata.statistics.HashableBloomFilter)1 OrcProto (io.prestosql.orc.proto.OrcProto)1