Search in sources :

Example 1 with DwrfStripeCacheData

use of com.facebook.presto.orc.metadata.DwrfStripeCacheData in project presto by prestodb.

the class OrcWriter method bufferFileFooter.

/**
 * Collects the pieces that make up the file footer. The returned elements are not
 * the serialized bytes themselves but {@link DataOutput} objects that know how to
 * write them. They are added in this order: metadata, DWRF stripe cache, footer,
 * postscript, and finally a single byte holding the postscript length.
 *
 * As a side effect this updates {@code numberOfRows} and {@code unencryptedStats},
 * and clears {@code closedStripes} once the footer object has been built.
 */
private List<DataOutput> bufferFileFooter() throws IOException {
    List<DataOutput> tailOutputs = new ArrayList<>();

    // metadata section: the statistics of every closed stripe
    List<StripeStatistics> stripeStatistics = closedStripes.stream()
            .map(ClosedStripe::getStatistics)
            .collect(toList());
    Slice metadataSlice = metadataWriter.writeMetadata(new Metadata(stripeStatistics));
    tailOutputs.add(createDataOutput(metadataSlice));

    // total row count is the sum over all closed stripes
    numberOfRows = closedStripes.stream()
            .mapToLong(closedStripe -> closedStripe.getStripeInformation().getNumberOfRows())
            .sum();

    // file-level statistics are derived from the per-stripe column statistics
    List<ColumnStatistics> fileStatistics = toFileStats(closedStripes.stream()
            .map(ClosedStripe::getStatistics)
            .map(StripeStatistics::getColumnStatistics)
            .collect(toList()));
    recordValidation(validation -> validation.setFileStatistics(fileStatistics));

    // user metadata values are stored as UTF-8 slices
    Map<String, Slice> userMetadataSlices = this.userMetadata.entrySet().stream()
            .collect(Collectors.toMap(Entry::getKey, entry -> utf8Slice(entry.getValue())));

    // split the file statistics into the unencrypted list and the per-group encrypted maps
    unencryptedStats = new ArrayList<>();
    Map<Integer, Map<Integer, Slice>> encryptedStatsByGroup = new HashMap<>();
    addStatsRecursive(fileStatistics, 0, new HashMap<>(), unencryptedStats, encryptedStatsByGroup);

    Optional<DwrfEncryption> dwrfEncryption;
    if (!dwrfWriterEncryption.isPresent()) {
        dwrfEncryption = Optional.empty();
    } else {
        List<WriterEncryptionGroup> writerGroups = dwrfWriterEncryption.get().getWriterEncryptionGroups();
        ImmutableList.Builder<EncryptionGroup> groups = ImmutableList.builder();
        for (int groupIndex = 0; groupIndex < writerGroups.size(); groupIndex++) {
            WriterEncryptionGroup writerGroup = writerGroups.get(groupIndex);
            Map<Integer, Slice> statsByNode = encryptedStatsByGroup.get(groupIndex);
            groups.add(new EncryptionGroup(
                    writerGroup.getNodes(),
                    // reader will just use key metadata from the stripe
                    Optional.empty(),
                    writerGroup.getNodes().stream().map(statsByNode::get).collect(toList())));
        }
        dwrfEncryption = Optional.of(new DwrfEncryption(dwrfWriterEncryption.get().getKeyProvider(), groups.build()));
    }

    // DWRF stripe cache section (the writer is optional)
    Optional<DwrfStripeCacheData> dwrfStripeCacheData = dwrfStripeCacheWriter.map(DwrfStripeCacheWriter::getDwrfStripeCacheData);
    Slice stripeCacheSlice = metadataWriter.writeDwrfStripeCache(dwrfStripeCacheData);
    tailOutputs.add(createDataOutput(stripeCacheSlice));
    Optional<List<Integer>> stripeCacheOffsets = dwrfStripeCacheWriter.map(DwrfStripeCacheWriter::getOffsets);

    // build the footer before the closed stripes are released
    Footer footer = new Footer(
            numberOfRows,
            rowGroupMaxRowCount,
            OptionalLong.of(rawSize),
            closedStripes.stream().map(ClosedStripe::getStripeInformation).collect(toList()),
            orcTypes,
            ImmutableList.copyOf(unencryptedStats),
            userMetadataSlices,
            dwrfEncryption,
            stripeCacheOffsets);
    closedStripes.clear();
    closedStripesRetainedBytes = 0;

    Slice footerSlice = metadataWriter.writeFooter(footer);
    tailOutputs.add(createDataOutput(footerSlice));
    recordValidation(validation -> validation.setVersion(metadataWriter.getOrcMetadataVersion()));

    // postscript records the footer and metadata lengths plus the compression settings
    Slice postscriptSlice = metadataWriter.writePostscript(
            footerSlice.length(),
            metadataSlice.length(),
            columnWriterOptions.getCompressionKind(),
            columnWriterOptions.getCompressionMaxBufferSize(),
            dwrfStripeCacheData);
    tailOutputs.add(createDataOutput(postscriptSlice));

    // last output is one byte holding the postscript length
    tailOutputs.add(createDataOutput(Slices.wrappedBuffer((byte) postscriptSlice.length())));
    return tailOutputs;
}
Also used : ArrayListMultimap(com.google.common.collect.ArrayListMultimap) Page(com.facebook.presto.common.Page) DateTimeZone(org.joda.time.DateTimeZone) DwrfMetadataWriter.toFileStatistics(com.facebook.presto.orc.metadata.DwrfMetadataWriter.toFileStatistics) StripeEncryptionGroup(com.facebook.presto.orc.metadata.StripeEncryptionGroup) StreamLayout(com.facebook.presto.orc.writer.StreamLayout) ColumnWriter(com.facebook.presto.orc.writer.ColumnWriter) DwrfEncryption(com.facebook.presto.orc.metadata.DwrfEncryption) DataSink(com.facebook.presto.common.io.DataSink) Preconditions.checkArgument(com.google.common.base.Preconditions.checkArgument) DictionaryColumnWriter(com.facebook.presto.orc.writer.DictionaryColumnWriter) DIRECT(com.facebook.presto.orc.metadata.ColumnEncoding.ColumnEncodingKind.DIRECT) DwrfStripeCacheWriter(com.facebook.presto.orc.metadata.DwrfStripeCacheWriter) Slices(io.airlift.slice.Slices) Map(java.util.Map) StripeInformation(com.facebook.presto.orc.metadata.StripeInformation) Slices.utf8Slice(io.airlift.slice.Slices.utf8Slice) ImmutableSet(com.google.common.collect.ImmutableSet) ImmutableMap(com.google.common.collect.ImmutableMap) CompressedMetadataWriter(com.facebook.presto.orc.metadata.CompressedMetadataWriter) Footer(com.facebook.presto.orc.metadata.Footer) UNENCRYPTED(com.facebook.presto.orc.DwrfEncryptionInfo.UNENCRYPTED) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) Collectors(java.util.stream.Collectors) ColumnStatistics(com.facebook.presto.orc.metadata.statistics.ColumnStatistics) Preconditions.checkState(com.google.common.base.Preconditions.checkState) ColumnEncoding(com.facebook.presto.orc.metadata.ColumnEncoding) DataSize(io.airlift.units.DataSize) List(java.util.List) DwrfMetadataWriter.toStripeEncryptionGroup(com.facebook.presto.orc.metadata.DwrfMetadataWriter.toStripeEncryptionGroup) ImmutableMap.toImmutableMap(com.google.common.collect.ImmutableMap.toImmutableMap) 
ClassLayout(org.openjdk.jol.info.ClassLayout) DWRF(com.facebook.presto.orc.OrcEncoding.DWRF) CompressionKind(com.facebook.presto.orc.metadata.CompressionKind) Entry(java.util.Map.Entry) Optional(java.util.Optional) Metadata(com.facebook.presto.orc.metadata.Metadata) IntStream(java.util.stream.IntStream) Slice(io.airlift.slice.Slice) MEGABYTE(io.airlift.units.DataSize.Unit.MEGABYTE) OrcWriteValidationMode(com.facebook.presto.orc.OrcWriteValidation.OrcWriteValidationMode) HashMap(java.util.HashMap) CLOSED(com.facebook.presto.orc.FlushReason.CLOSED) Multimap(com.google.common.collect.Multimap) TreeSet(java.util.TreeSet) ArrayList(java.util.ArrayList) DynamicSliceOutput(io.airlift.slice.DynamicSliceOutput) OptionalLong(java.util.OptionalLong) ImmutableList(com.google.common.collect.ImmutableList) MAGIC(com.facebook.presto.orc.metadata.PostScript.MAGIC) Verify.verify(com.google.common.base.Verify.verify) Objects.requireNonNull(java.util.Objects.requireNonNull) DataOutput.createDataOutput(com.facebook.presto.common.io.DataOutput.createDataOutput) LastUsedCompressionBufferPool(com.facebook.presto.orc.writer.CompressionBufferPool.LastUsedCompressionBufferPool) OrcType(com.facebook.presto.orc.metadata.OrcType) StreamDataOutput(com.facebook.presto.orc.stream.StreamDataOutput) Math.toIntExact(java.lang.Math.toIntExact) Type(com.facebook.presto.common.type.Type) Nullable(javax.annotation.Nullable) Integer.min(java.lang.Integer.min) ColumnWriters.createColumnWriter(com.facebook.presto.orc.writer.ColumnWriters.createColumnWriter) StripeStatistics(com.facebook.presto.orc.metadata.statistics.StripeStatistics) OrcReader.validateFile(com.facebook.presto.orc.OrcReader.validateFile) OrcWriteValidationBuilder(com.facebook.presto.orc.OrcWriteValidation.OrcWriteValidationBuilder) IOException(java.io.IOException) DwrfStripeCacheData(com.facebook.presto.orc.metadata.DwrfStripeCacheData) Stream(com.facebook.presto.orc.metadata.Stream) Consumer(java.util.function.Consumer) 
EncryptionGroup(com.facebook.presto.orc.metadata.EncryptionGroup) Collectors.toList(java.util.stream.Collectors.toList) StripeFooter(com.facebook.presto.orc.metadata.StripeFooter) CompressionBufferPool(com.facebook.presto.orc.writer.CompressionBufferPool) Closeable(java.io.Closeable) DwrfProto(com.facebook.presto.orc.proto.DwrfProto) VisibleForTesting(com.google.common.annotations.VisibleForTesting) DataOutput(com.facebook.presto.common.io.DataOutput) DwrfEncryptionInfo.createNodeToGroupMap(com.facebook.presto.orc.DwrfEncryptionInfo.createNodeToGroupMap) DataOutput.createDataOutput(com.facebook.presto.common.io.DataOutput.createDataOutput) StreamDataOutput(com.facebook.presto.orc.stream.StreamDataOutput) DataOutput(com.facebook.presto.common.io.DataOutput) HashMap(java.util.HashMap) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) ImmutableList(com.google.common.collect.ImmutableList) ArrayList(java.util.ArrayList) Metadata(com.facebook.presto.orc.metadata.Metadata) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) List(java.util.List) ArrayList(java.util.ArrayList) ImmutableList(com.google.common.collect.ImmutableList) Collectors.toList(java.util.stream.Collectors.toList) DwrfStripeCacheWriter(com.facebook.presto.orc.metadata.DwrfStripeCacheWriter) ColumnStatistics(com.facebook.presto.orc.metadata.statistics.ColumnStatistics) StripeStatistics(com.facebook.presto.orc.metadata.statistics.StripeStatistics) StripeEncryptionGroup(com.facebook.presto.orc.metadata.StripeEncryptionGroup) DwrfMetadataWriter.toStripeEncryptionGroup(com.facebook.presto.orc.metadata.DwrfMetadataWriter.toStripeEncryptionGroup) EncryptionGroup(com.facebook.presto.orc.metadata.EncryptionGroup) Slices.utf8Slice(io.airlift.slice.Slices.utf8Slice) Slice(io.airlift.slice.Slice) Footer(com.facebook.presto.orc.metadata.Footer) StripeFooter(com.facebook.presto.orc.metadata.StripeFooter) 
DwrfStripeCacheData(com.facebook.presto.orc.metadata.DwrfStripeCacheData) Map(java.util.Map) ImmutableMap(com.google.common.collect.ImmutableMap) ImmutableMap.toImmutableMap(com.google.common.collect.ImmutableMap.toImmutableMap) HashMap(java.util.HashMap) DwrfEncryptionInfo.createNodeToGroupMap(com.facebook.presto.orc.DwrfEncryptionInfo.createNodeToGroupMap) DwrfEncryption(com.facebook.presto.orc.metadata.DwrfEncryption)

Example 2 with DwrfStripeCacheData

use of com.facebook.presto.orc.metadata.DwrfStripeCacheData in project presto by prestodb.

the class TestStorageOrcFileTailSource method testReadDwrfStripeCacheIfEnabled.

/**
 * Writes a file consisting of a fake 100-byte stripe cache followed by a DWRF footer
 * and postscript, then checks that {@link StorageOrcFileTailSource} with the
 * "read dwrf stripe cache" flag enabled returns the footer and the stripe cache
 * bytes, size and mode.
 */
@Test
public void testReadDwrfStripeCacheIfEnabled() throws IOException {
    // fake stripe cache: byte i holds the value i
    byte[] stripeCache = new byte[100];
    for (int i = 0; i < stripeCache.length; i++) {
        stripeCache[i] = (byte) i;
    }
    // footer and post script describing the cache written above
    DwrfProto.Footer.Builder footer = DwrfProto.Footer.newBuilder().addAllStripeCacheOffsets(ImmutableList.of(1, 2, 3));
    DwrfProto.PostScript.Builder postScript = DwrfProto.PostScript.newBuilder().setCompression(NONE).setCacheMode(BOTH).setCacheSize(stripeCache.length);
    // try-with-resources releases the file handle even when a write fails
    try (FileOutputStream out = new FileOutputStream(file.getFile())) {
        out.write(stripeCache);
        writeTail(footer, postScript, out);
    }
    // read the file tail with the enabled "read dwrf stripe cache" feature
    StorageOrcFileTailSource src = new StorageOrcFileTailSource(FOOTER_READ_SIZE_IN_BYTES, true);
    OrcDataSource orcDataSource = createFileOrcDataSource();
    OrcFileTail orcFileTail = src.getOrcFileTail(orcDataSource, metadataReader, Optional.empty(), false);
    assertEquals(orcFileTail.getMetadataSize(), 0);
    DwrfProto.Footer actualFooter = readFooter(orcFileTail);
    assertEquals(actualFooter, footer.build());
    // make sure the stripe cache is loaded correctly
    assertTrue(orcFileTail.getDwrfStripeCacheData().isPresent());
    DwrfStripeCacheData dwrfStripeCacheData = orcFileTail.getDwrfStripeCacheData().get();
    // NOTE(review): the proto cache mode BOTH is expected to surface as INDEX_AND_FOOTER — mapping lives outside this test
    assertEquals(dwrfStripeCacheData.getDwrfStripeCacheMode(), INDEX_AND_FOOTER);
    assertEquals(dwrfStripeCacheData.getDwrfStripeCacheSize(), stripeCache.length);
    assertEquals(dwrfStripeCacheData.getDwrfStripeCacheSlice().getBytes(), stripeCache);
}
Also used : StorageOrcFileTailSource(com.facebook.presto.orc.cache.StorageOrcFileTailSource) DwrfProto(com.facebook.presto.orc.proto.DwrfProto) OrcFileTail(com.facebook.presto.orc.metadata.OrcFileTail) FileOutputStream(java.io.FileOutputStream) DwrfStripeCacheData(com.facebook.presto.orc.metadata.DwrfStripeCacheData) Test(org.testng.annotations.Test)

Example 3 with DwrfStripeCacheData

use of com.facebook.presto.orc.metadata.DwrfStripeCacheData in project presto by prestodb.

the class StorageOrcFileTailSource method getOrcFileTail.

/**
 * Reads and decodes the tail of the file behind {@code orcDataSource}.
 *
 * The method first reads up to {@code expectedFooterSizeInBytes} bytes from the end of
 * the file, decodes the postscript found there, and uses the lengths it declares to
 * slice out the footer, the metadata and (when enabled and present) the DWRF stripe
 * cache. If the initial read did not cover the whole tail, the missing leading part is
 * fetched with a second read.
 *
 * @param orcDataSource source to read from
 * @param metadataReader decoder used for the postscript
 * @param writeValidation optional validation of version and compression against what was written
 * @param cacheable not consulted by this method
 * @return the decoded tail: writer version, compression settings, footer and metadata
 *         slices, and the optional DWRF stripe cache data
 * @throws OrcCorruptionException if the file is too small, the postscript length,
 *         footer length or metadata length is invalid, the postscript cannot be
 *         decoded, or the declared tail does not fit into the file
 * @throws IOException if reading from the data source fails
 */
@Override
public OrcFileTail getOrcFileTail(OrcDataSource orcDataSource, MetadataReader metadataReader, Optional<OrcWriteValidation> writeValidation, boolean cacheable) throws IOException {
    long size = orcDataSource.getSize();
    if (size <= MAGIC.length()) {
        throw new OrcCorruptionException(orcDataSource.getId(), "Invalid file size %s", size);
    }
    // Read the tail of the file
    byte[] buffer = new byte[toIntExact(min(size, expectedFooterSizeInBytes))];
    orcDataSource.readFully(size - buffer.length, buffer);
    // get length of PostScript - last byte of the file
    int postScriptSize = buffer[buffer.length - SIZE_OF_BYTE] & 0xff;
    if (postScriptSize >= buffer.length) {
        throw new OrcCorruptionException(orcDataSource.getId(), "Invalid postscript length %s", postScriptSize);
    }
    // decode the post script
    PostScript postScript;
    try {
        postScript = metadataReader.readPostScript(buffer, buffer.length - SIZE_OF_BYTE - postScriptSize, postScriptSize);
    } catch (OrcCorruptionException e) {
        // check if this is an ORC file and not an RCFile or something else
        if (!isValidHeaderMagic(orcDataSource)) {
            throw new OrcCorruptionException(orcDataSource.getId(), "Not an ORC file");
        }
        throw e;
    }
    // verify this is a supported version
    checkOrcVersion(orcDataSource, postScript.getVersion());
    validateWrite(writeValidation, orcDataSource, validation -> validation.getVersion().equals(postScript.getVersion()), "Unexpected version");
    int bufferSize = toIntExact(postScript.getCompressionBlockSize());
    // check compression codec is supported
    CompressionKind compressionKind = postScript.getCompression();
    validateWrite(writeValidation, orcDataSource, validation -> validation.getCompression() == compressionKind, "Unexpected compression");
    PostScript.HiveWriterVersion hiveWriterVersion = postScript.getHiveWriterVersion();
    int footerSize = toIntExact(postScript.getFooterLength());
    int metadataSize = toIntExact(postScript.getMetadataLength());
    if (footerSize < 0) {
        throw new OrcCorruptionException(orcDataSource.getId(), "Invalid footer length %s", footerSize);
    }
    if (metadataSize < 0) {
        throw new OrcCorruptionException(orcDataSource.getId(), "Invalid metadata length %s", metadataSize);
    }
    // read DWRF stripe cache only if this feature is enabled and it has meaningful data
    boolean readDwrfStripeCache = dwrfStripeCacheEnabled && postScript.getDwrfStripeCacheLength().isPresent() && postScript.getDwrfStripeCacheMode().isPresent() && postScript.getDwrfStripeCacheMode().get() != DwrfStripeCacheMode.NONE;
    int dwrfStripeCacheSize = 0;
    if (readDwrfStripeCache) {
        dwrfStripeCacheSize = postScript.getDwrfStripeCacheLength().getAsInt();
        checkSizes(orcDataSource, metadataSize, dwrfStripeCacheSize);
    }
    // The section lengths come from the file itself, so sum them in long arithmetic and
    // reject tails that cannot fit into the file (or into an array) instead of letting an
    // int overflow or a negative read position surface as an unrelated exception.
    long declaredTailSize = (long) dwrfStripeCacheSize + metadataSize + footerSize + postScriptSize + SIZE_OF_BYTE;
    if (declaredTailSize > size || declaredTailSize > Integer.MAX_VALUE) {
        throw new OrcCorruptionException(orcDataSource.getId(), "Invalid file tail size %s for file size %s", declaredTailSize, size);
    }
    // check if extra bytes need to be read
    Slice completeFooterSlice;
    int completeFooterSize = (int) declaredTailSize;
    if (completeFooterSize > buffer.length) {
        // allocate a new buffer large enough for the complete footer
        byte[] newBuffer = new byte[completeFooterSize];
        completeFooterSlice = Slices.wrappedBuffer(newBuffer);
        // initial read was not large enough, so read missing section
        orcDataSource.readFully(size - completeFooterSize, newBuffer, 0, completeFooterSize - buffer.length);
        // copy already read bytes into the new buffer
        completeFooterSlice.setBytes(completeFooterSize - buffer.length, buffer);
    } else {
        // footer is already in the bytes in buffer, just adjust position, length
        completeFooterSlice = Slices.wrappedBuffer(buffer, buffer.length - completeFooterSize, completeFooterSize);
    }
    // metadataSize is set only for ORC files, dwrfStripeCacheSize is set only for DWRF files
    // it should be safe to sum them up to find footer offset
    // TAIL: [ ORC_METADATA{0,1} | DWRF_STRIPE_CACHE {0,1} ] + FOOTER + POST_SCRIPT + POST_SCRIPT_SIZE (1 byte)
    int footerSliceOffset = metadataSize + dwrfStripeCacheSize;
    Slice footerSlice = completeFooterSlice.slice(footerSliceOffset, footerSize);
    Slice metadataSlice = completeFooterSlice.slice(0, metadataSize);
    // set DwrfStripeCacheData only if the stripe cache feature is enabled and the file has the stripe cache
    Optional<DwrfStripeCacheData> dwrfStripeCacheData = Optional.empty();
    if (readDwrfStripeCache) {
        Slice dwrfStripeCacheSlice = completeFooterSlice.slice(0, dwrfStripeCacheSize);
        DwrfStripeCacheMode stripeCacheMode = postScript.getDwrfStripeCacheMode().get();
        dwrfStripeCacheData = Optional.of(new DwrfStripeCacheData(dwrfStripeCacheSlice, dwrfStripeCacheSize, stripeCacheMode));
    }
    return new OrcFileTail(hiveWriterVersion, bufferSize, compressionKind, footerSlice, footerSize, metadataSlice, metadataSize, dwrfStripeCacheData);
}
Also used : CompressionKind(com.facebook.presto.orc.metadata.CompressionKind) OrcFileTail(com.facebook.presto.orc.metadata.OrcFileTail) PostScript(com.facebook.presto.orc.metadata.PostScript) DwrfStripeCacheMode(com.facebook.presto.orc.metadata.DwrfStripeCacheMode) Slice(io.airlift.slice.Slice) DwrfStripeCacheData(com.facebook.presto.orc.metadata.DwrfStripeCacheData) OrcCorruptionException(com.facebook.presto.orc.OrcCorruptionException)

Aggregations

DwrfStripeCacheData (com.facebook.presto.orc.metadata.DwrfStripeCacheData)3 CompressionKind (com.facebook.presto.orc.metadata.CompressionKind)2 OrcFileTail (com.facebook.presto.orc.metadata.OrcFileTail)2 DwrfProto (com.facebook.presto.orc.proto.DwrfProto)2 Page (com.facebook.presto.common.Page)1 DataOutput (com.facebook.presto.common.io.DataOutput)1 DataOutput.createDataOutput (com.facebook.presto.common.io.DataOutput.createDataOutput)1 DataSink (com.facebook.presto.common.io.DataSink)1 Type (com.facebook.presto.common.type.Type)1 UNENCRYPTED (com.facebook.presto.orc.DwrfEncryptionInfo.UNENCRYPTED)1 DwrfEncryptionInfo.createNodeToGroupMap (com.facebook.presto.orc.DwrfEncryptionInfo.createNodeToGroupMap)1 CLOSED (com.facebook.presto.orc.FlushReason.CLOSED)1 OrcCorruptionException (com.facebook.presto.orc.OrcCorruptionException)1 DWRF (com.facebook.presto.orc.OrcEncoding.DWRF)1 OrcReader.validateFile (com.facebook.presto.orc.OrcReader.validateFile)1 OrcWriteValidationBuilder (com.facebook.presto.orc.OrcWriteValidation.OrcWriteValidationBuilder)1 OrcWriteValidationMode (com.facebook.presto.orc.OrcWriteValidation.OrcWriteValidationMode)1 StorageOrcFileTailSource (com.facebook.presto.orc.cache.StorageOrcFileTailSource)1 ColumnEncoding (com.facebook.presto.orc.metadata.ColumnEncoding)1 DIRECT (com.facebook.presto.orc.metadata.ColumnEncoding.ColumnEncodingKind.DIRECT)1