Search in sources :

Example 1 with PostScript

Use of com.facebook.presto.orc.metadata.PostScript in the presto project by prestodb.

In the class StorageOrcFileTailSource, method getOrcFileTail.

/**
 * Reads the tail of the file behind {@code orcDataSource} and decodes it into an {@link OrcFileTail}.
 *
 * <p>The tail is laid out as
 * {@code [ ORC_METADATA{0,1} | DWRF_STRIPE_CACHE{0,1} ] + FOOTER + POST_SCRIPT + POST_SCRIPT_SIZE (1 byte)}.
 * An initial read of up to {@code expectedFooterSizeInBytes} bytes is issued from the end of the file;
 * if the lengths recorded in the postscript show that the tail is larger, only the missing leading
 * section is read in a second request.
 *
 * @param orcDataSource source of the file bytes
 * @param metadataReader reader used to decode the postscript
 * @param writeValidation optional write validation that the decoded version and compression are checked against
 * @param cacheable not used by this implementation
 * @return the decoded tail: writer version, compression settings, footer slice, metadata slice and,
 *         when enabled and present, the DWRF stripe cache data
 * @throws OrcCorruptionException if the file is too small, any recorded length is negative,
 *         the recorded tail is larger than the file, or the postscript cannot be decoded
 * @throws IOException if reading from the data source fails
 */
@Override
public OrcFileTail getOrcFileTail(OrcDataSource orcDataSource, MetadataReader metadataReader, Optional<OrcWriteValidation> writeValidation, boolean cacheable) throws IOException {
    long size = orcDataSource.getSize();
    if (size <= MAGIC.length()) {
        throw new OrcCorruptionException(orcDataSource.getId(), "Invalid file size %s", size);
    }
    // Read the tail of the file
    byte[] buffer = new byte[toIntExact(min(size, expectedFooterSizeInBytes))];
    orcDataSource.readFully(size - buffer.length, buffer);
    // get length of PostScript - last byte of the file (masked so it is treated as unsigned)
    int postScriptSize = buffer[buffer.length - SIZE_OF_BYTE] & 0xff;
    if (postScriptSize >= buffer.length) {
        throw new OrcCorruptionException(orcDataSource.getId(), "Invalid postscript length %s", postScriptSize);
    }
    // decode the post script
    PostScript postScript;
    try {
        postScript = metadataReader.readPostScript(buffer, buffer.length - SIZE_OF_BYTE - postScriptSize, postScriptSize);
    } catch (OrcCorruptionException e) {
        // check if this is an ORC file and not an RCFile or something else
        if (!isValidHeaderMagic(orcDataSource)) {
            throw new OrcCorruptionException(orcDataSource.getId(), "Not an ORC file");
        }
        throw e;
    }
    // verify this is a supported version
    checkOrcVersion(orcDataSource, postScript.getVersion());
    validateWrite(writeValidation, orcDataSource, validation -> validation.getVersion().equals(postScript.getVersion()), "Unexpected version");
    int bufferSize = toIntExact(postScript.getCompressionBlockSize());
    // check compression codec is supported
    CompressionKind compressionKind = postScript.getCompression();
    validateWrite(writeValidation, orcDataSource, validation -> validation.getCompression() == compressionKind, "Unexpected compression");
    PostScript.HiveWriterVersion hiveWriterVersion = postScript.getHiveWriterVersion();
    int footerSize = toIntExact(postScript.getFooterLength());
    int metadataSize = toIntExact(postScript.getMetadataLength());
    if (footerSize < 0) {
        throw new OrcCorruptionException(orcDataSource.getId(), "Invalid footer length %s", footerSize);
    }
    if (metadataSize < 0) {
        throw new OrcCorruptionException(orcDataSource.getId(), "Invalid metadata length %s", metadataSize);
    }
    // read DWRF stripe cache only if this feature is enabled and it has meaningful data
    boolean readDwrfStripeCache = dwrfStripeCacheEnabled && postScript.getDwrfStripeCacheLength().isPresent() && postScript.getDwrfStripeCacheMode().isPresent() && postScript.getDwrfStripeCacheMode().get() != DwrfStripeCacheMode.NONE;
    int dwrfStripeCacheSize = 0;
    if (readDwrfStripeCache) {
        dwrfStripeCacheSize = postScript.getDwrfStripeCacheLength().getAsInt();
        // the length comes straight from the file, so reject negative values like the other lengths
        if (dwrfStripeCacheSize < 0) {
            throw new OrcCorruptionException(orcDataSource.getId(), "Invalid DWRF stripe cache length %s", dwrfStripeCacheSize);
        }
        checkSizes(orcDataSource, metadataSize, dwrfStripeCacheSize);
    }
    // Sum the section lengths as a long: every operand is a non-negative int, so the long sum
    // cannot overflow, whereas an int sum of corrupt lengths could wrap to a negative value.
    long completeFooterSizeLong = (long) dwrfStripeCacheSize + metadataSize + footerSize + postScriptSize + SIZE_OF_BYTE;
    if (completeFooterSizeLong > size) {
        // the recorded tail cannot be larger than the file itself; reading it would need a negative offset
        throw new OrcCorruptionException(orcDataSource.getId(), "Invalid file tail size %s for file size %s", completeFooterSizeLong, size);
    }
    // check if extra bytes need to be read
    Slice completeFooterSlice;
    int completeFooterSize = toIntExact(completeFooterSizeLong);
    if (completeFooterSize > buffer.length) {
        // allocate a new buffer large enough for the complete footer
        byte[] newBuffer = new byte[completeFooterSize];
        completeFooterSlice = Slices.wrappedBuffer(newBuffer);
        // initial read was not large enough, so read missing section
        orcDataSource.readFully(size - completeFooterSize, newBuffer, 0, completeFooterSize - buffer.length);
        // copy already read bytes into the new buffer
        completeFooterSlice.setBytes(completeFooterSize - buffer.length, buffer);
    } else {
        // footer is already in the bytes in buffer, just adjust position, length
        completeFooterSlice = Slices.wrappedBuffer(buffer, buffer.length - completeFooterSize, completeFooterSize);
    }
    // metadataSize is set only for ORC files, dwrfStripeCacheSize is set only for DWRF files
    // it should be safe to sum them up to find footer offset
    // TAIL: [ ORC_METADATA{0,1} | DWRF_STRIPE_CACHE {0,1} ] + FOOTER + POST_SCRIPT + POST_SCRIPT_SIZE (1 byte)
    int footerSliceOffset = metadataSize + dwrfStripeCacheSize;
    Slice footerSlice = completeFooterSlice.slice(footerSliceOffset, footerSize);
    Slice metadataSlice = completeFooterSlice.slice(0, metadataSize);
    // set DwrfStripeCacheData only if the stripe cache feature is enabled and the file has the stripe cache
    Optional<DwrfStripeCacheData> dwrfStripeCacheData = Optional.empty();
    if (readDwrfStripeCache) {
        Slice dwrfStripeCacheSlice = completeFooterSlice.slice(0, dwrfStripeCacheSize);
        DwrfStripeCacheMode stripeCacheMode = postScript.getDwrfStripeCacheMode().get();
        dwrfStripeCacheData = Optional.of(new DwrfStripeCacheData(dwrfStripeCacheSlice, dwrfStripeCacheSize, stripeCacheMode));
    }
    return new OrcFileTail(hiveWriterVersion, bufferSize, compressionKind, footerSlice, footerSize, metadataSlice, metadataSize, dwrfStripeCacheData);
}
Also used : CompressionKind(com.facebook.presto.orc.metadata.CompressionKind) OrcFileTail(com.facebook.presto.orc.metadata.OrcFileTail) PostScript(com.facebook.presto.orc.metadata.PostScript) DwrfStripeCacheMode(com.facebook.presto.orc.metadata.DwrfStripeCacheMode) Slice(io.airlift.slice.Slice) DwrfStripeCacheData(com.facebook.presto.orc.metadata.DwrfStripeCacheData) OrcCorruptionException(com.facebook.presto.orc.OrcCorruptionException)

Aggregations

OrcCorruptionException (com.facebook.presto.orc.OrcCorruptionException)1 CompressionKind (com.facebook.presto.orc.metadata.CompressionKind)1 DwrfStripeCacheData (com.facebook.presto.orc.metadata.DwrfStripeCacheData)1 DwrfStripeCacheMode (com.facebook.presto.orc.metadata.DwrfStripeCacheMode)1 OrcFileTail (com.facebook.presto.orc.metadata.OrcFileTail)1 PostScript (com.facebook.presto.orc.metadata.PostScript)1 Slice (io.airlift.slice.Slice)1