Search in sources :

Example 1 with DwrfStripeCacheMode

use of com.facebook.presto.orc.metadata.DwrfStripeCacheMode in project presto by prestodb.

From the class TestOrcWriterOptions, method testToString.

/**
 * Builds an {@code OrcWriterOptions} with a distinct, non-default-looking value for each
 * builder setter used below and checks the exact text produced by {@code toString()}.
 */
@Test
public void testToString() {
    // flush policy inputs
    DataSize minStripeSize = new DataSize(13, MEGABYTE);
    DataSize maxStripeSize = new DataSize(27, MEGABYTE);
    int maxStripeRows = 1_100_000;

    // dictionary related inputs
    DataSize dictionaryMemoryLimit = new DataSize(13_000, KILOBYTE);
    DataSize dictionaryAlmostFullRange = new DataSize(1_000, KILOBYTE);
    int usefulCheckPerChunkFrequency = 9_999;
    DataSize usefulCheckColumnSize = new DataSize(1, MEGABYTE);
    boolean integerDictionaryEncoding = false;
    boolean stringDictionarySorting = true;
    int directEncodingStripeCount = 0;

    // remaining writer inputs
    int maxRowGroupRows = 15_000;
    DataSize stringStatisticsLimit = new DataSize(128, BYTE);
    DataSize compressionBufferLimit = new DataSize(512, KILOBYTE);
    OptionalInt level = OptionalInt.of(5);
    StreamLayoutFactory layoutFactory = new ColumnSizeLayoutFactory();

    // DWRF stripe cache inputs
    DataSize stripeCacheLimit = new DataSize(4, MEGABYTE);
    DwrfStripeCacheMode stripeCacheMode = DwrfStripeCacheMode.INDEX_AND_FOOTER;

    OrcWriterOptions writerOptions = OrcWriterOptions.builder()
            .withFlushPolicy(DefaultOrcWriterFlushPolicy.builder()
                    .withStripeMinSize(minStripeSize)
                    .withStripeMaxSize(maxStripeSize)
                    .withStripeMaxRowCount(maxStripeRows)
                    .build())
            .withRowGroupMaxRowCount(maxRowGroupRows)
            .withDictionaryMaxMemory(dictionaryMemoryLimit)
            .withDictionaryMemoryAlmostFullRange(dictionaryAlmostFullRange)
            .withDictionaryUsefulCheckPerChunkFrequency(usefulCheckPerChunkFrequency)
            .withDictionaryUsefulCheckColumnSize(usefulCheckColumnSize)
            .withMaxStringStatisticsLimit(stringStatisticsLimit)
            .withMaxCompressionBufferSize(compressionBufferLimit)
            .withCompressionLevel(level)
            .withStreamLayoutFactory(layoutFactory)
            .withIntegerDictionaryEncodingEnabled(integerDictionaryEncoding)
            .withStringDictionarySortingEnabled(stringDictionarySorting)
            .withDwrfStripeCacheEnabled(true)
            .withDwrfStripeCacheMaxSize(stripeCacheLimit)
            .withDwrfStripeCacheMode(stripeCacheMode)
            .withPreserveDirectEncodingStripeCount(directEncodingStripeCount)
            .build();

    // the expected text, one fragment per line
    String expectedString = "OrcWriterOptions{flushPolicy=DefaultOrcWriterFlushPolicy{stripeMaxRowCount=1100000, "
            + "stripeMinBytes=13631488, stripeMaxBytes=28311552}, rowGroupMaxRowCount=15000, "
            + "dictionaryMaxMemory=13000kB, dictionaryMemoryAlmostFullRange=1000kB, dictionaryUsefulCheckPerChunkFrequency=9999, "
            + "dictionaryUsefulCheckColumnSize=1MB, maxStringStatisticsLimit=128B, maxCompressionBufferSize=512kB, "
            + "compressionLevel=OptionalInt[5], streamLayoutFactory=ColumnSizeLayoutFactory{}, integerDictionaryEncodingEnabled=false, "
            + "stringDictionarySortingEnabled=true, stringDictionaryEncodingEnabled=true, "
            + "dwrfWriterOptions=Optional[DwrfStripeCacheOptions{stripeCacheMode=INDEX_AND_FOOTER, stripeCacheMaxSize=4MB}], "
            + "ignoreDictionaryRowGroupSizes=false, preserveDirectEncodingStripeCount=0}";

    // NOTE(review): if assertEquals is TestNG's Assert.assertEquals, its parameters are
    // (actual, expected); the pass/fail outcome is the same either way — confirm before swapping.
    assertEquals(expectedString, writerOptions.toString());
}
Also used : DwrfStripeCacheMode(com.facebook.presto.orc.metadata.DwrfStripeCacheMode) StreamLayoutFactory(com.facebook.presto.orc.writer.StreamLayoutFactory) DataSize(io.airlift.units.DataSize) ColumnSizeLayoutFactory(com.facebook.presto.orc.writer.StreamLayoutFactory.ColumnSizeLayoutFactory) OptionalInt(java.util.OptionalInt) Test(org.testng.annotations.Test)

Example 2 with DwrfStripeCacheMode

use of com.facebook.presto.orc.metadata.DwrfStripeCacheMode in project presto by prestodb.

From the class TestOrcFileWriterConfig, method testOrcWriterOptionsBuilder.

/**
 * Populates an {@code OrcFileWriterConfig} through its setters, checks that every getter
 * returns the value that was set, and then checks the {@code OrcWriterOptions} built from it.
 */
@Test
public void testOrcWriterOptionsBuilder() {
    DataSize minStripeSize = new DataSize(10, MEGABYTE);
    DataSize maxStripeSize = new DataSize(50, MEGABYTE);
    int maxStripeRows = 1_000_000;
    int maxRowGroupRows = 15_000;
    DataSize dictionaryMemoryLimit = new DataSize(20, MEGABYTE);
    DataSize statisticsLimit = new DataSize(32, BYTE);
    DataSize compressionBufferLimit = new DataSize(512, KILOBYTE);
    StreamLayoutType layoutType = BY_STREAM_SIZE;
    DataSize stripeCacheLimit = new DataSize(4, MEGABYTE);
    DwrfStripeCacheMode stripeCacheMode = INDEX;

    OrcFileWriterConfig config = new OrcFileWriterConfig()
            .setStripeMinSize(minStripeSize)
            .setStripeMaxSize(maxStripeSize)
            .setStripeMaxRowCount(maxStripeRows)
            .setRowGroupMaxRowCount(maxRowGroupRows)
            .setDictionaryMaxMemory(dictionaryMemoryLimit)
            .setStringStatisticsLimit(statisticsLimit)
            .setMaxCompressionBufferSize(compressionBufferLimit)
            .setStreamLayoutType(layoutType)
            .setDwrfStripeCacheEnabled(false)
            .setDwrfStripeCacheMaxSize(stripeCacheLimit)
            .setDwrfStripeCacheMode(stripeCacheMode);

    // the config itself must hand back exactly what was set
    assertEquals(minStripeSize, config.getStripeMinSize());
    assertEquals(maxStripeSize, config.getStripeMaxSize());
    assertEquals(maxStripeRows, config.getStripeMaxRowCount());
    assertEquals(maxRowGroupRows, config.getRowGroupMaxRowCount());
    assertEquals(dictionaryMemoryLimit, config.getDictionaryMaxMemory());
    assertEquals(statisticsLimit, config.getStringStatisticsLimit());
    assertEquals(compressionBufferLimit, config.getMaxCompressionBufferSize());
    assertEquals(layoutType, config.getStreamLayoutType());
    assertFalse(config.isDwrfStripeCacheEnabled());
    assertEquals(stripeCacheLimit, config.getDwrfStripeCacheMaxSize());
    assertEquals(stripeCacheMode, config.getDwrfStripeCacheMode());

    // two consecutive calls must not return the same builder instance
    assertNotSame(config.toOrcWriterOptionsBuilder(), config.toOrcWriterOptionsBuilder());

    // the options built from the config must carry the configured values
    OrcWriterOptions options = config.toOrcWriterOptionsBuilder().build();
    assertEquals(toIntExact(minStripeSize.toBytes()), options.getFlushPolicy().getStripeMinBytes());
    assertEquals(toIntExact(maxStripeSize.toBytes()), options.getFlushPolicy().getStripeMaxBytes());
    assertEquals(maxStripeRows, options.getFlushPolicy().getStripeMaxRowCount());
    assertEquals(maxRowGroupRows, options.getRowGroupMaxRowCount());
    assertEquals(dictionaryMemoryLimit, options.getDictionaryMaxMemory());
    assertEquals(statisticsLimit, options.getMaxStringStatisticsLimit());
    assertEquals(compressionBufferLimit, options.getMaxCompressionBufferSize());
    assertTrue(options.getStreamLayoutFactory() instanceof StreamSizeLayoutFactory);
    // the stripe cache was disabled on the config, so no stripe cache options are expected
    assertEquals(Optional.empty(), options.getDwrfStripeCacheOptions());
}
Also used : OrcWriterOptions(com.facebook.presto.orc.OrcWriterOptions) DwrfStripeCacheMode(com.facebook.presto.orc.metadata.DwrfStripeCacheMode) DataSize(io.airlift.units.DataSize) StreamSizeLayoutFactory(com.facebook.presto.orc.writer.StreamLayoutFactory.StreamSizeLayoutFactory) StreamLayoutType(com.facebook.presto.hive.OrcFileWriterConfig.StreamLayoutType) Test(org.testng.annotations.Test)

Example 3 with DwrfStripeCacheMode

use of com.facebook.presto.orc.metadata.DwrfStripeCacheMode in project presto by prestodb.

From the class StorageOrcFileTailSource, method getOrcFileTail.

/**
 * Reads the tail of the file behind {@code orcDataSource} and splits it into its sections.
 *
 * <p>The method first reads up to {@code expectedFooterSizeInBytes} bytes from the end of the
 * file, decodes the postscript from them, and then, if the lengths recorded in the postscript
 * show that the tail is larger than what was read, issues a second read for the missing part.
 * The footer, metadata and (optionally) DWRF stripe cache sections are returned as slices over
 * the resulting buffer.
 *
 * @param orcDataSource source the file size and the tail bytes are read from
 * @param metadataReader reader used to decode the postscript
 * @param writeValidation passed to {@code validateWrite} together with checks of the postscript
 *        version and compression kind
 * @param cacheable not read by this method
 * @return the file tail: writer version, compression block size, compression kind, footer and
 *         metadata slices with their sizes, and the stripe cache data when it was read
 * @throws OrcCorruptionException if the file is not larger than the magic, if the postscript,
 *         footer or metadata length is invalid, if the postscript cannot be decoded, or if the
 *         total tail length does not fit into the file or into an {@code int}
 * @throws IOException as declared by the overridden method
 */
@Override
public OrcFileTail getOrcFileTail(OrcDataSource orcDataSource, MetadataReader metadataReader, Optional<OrcWriteValidation> writeValidation, boolean cacheable) throws IOException {
    long size = orcDataSource.getSize();
    if (size <= MAGIC.length()) {
        throw new OrcCorruptionException(orcDataSource.getId(), "Invalid file size %s", size);
    }
    // Read the tail of the file
    byte[] buffer = new byte[toIntExact(min(size, expectedFooterSizeInBytes))];
    orcDataSource.readFully(size - buffer.length, buffer);
    // get length of PostScript - last byte of the file
    int postScriptSize = buffer[buffer.length - SIZE_OF_BYTE] & 0xff;
    if (postScriptSize >= buffer.length) {
        throw new OrcCorruptionException(orcDataSource.getId(), "Invalid postscript length %s", postScriptSize);
    }
    // decode the post script
    PostScript postScript;
    try {
        postScript = metadataReader.readPostScript(buffer, buffer.length - SIZE_OF_BYTE - postScriptSize, postScriptSize);
    } catch (OrcCorruptionException e) {
        // check if this is an ORC file and not an RCFile or something else
        if (!isValidHeaderMagic(orcDataSource)) {
            throw new OrcCorruptionException(orcDataSource.getId(), "Not an ORC file");
        }
        throw e;
    }
    // verify this is a supported version
    checkOrcVersion(orcDataSource, postScript.getVersion());
    validateWrite(writeValidation, orcDataSource, validation -> validation.getVersion().equals(postScript.getVersion()), "Unexpected version");
    int bufferSize = toIntExact(postScript.getCompressionBlockSize());
    // check compression codec is supported
    CompressionKind compressionKind = postScript.getCompression();
    validateWrite(writeValidation, orcDataSource, validation -> validation.getCompression() == compressionKind, "Unexpected compression");
    PostScript.HiveWriterVersion hiveWriterVersion = postScript.getHiveWriterVersion();
    int footerSize = toIntExact(postScript.getFooterLength());
    int metadataSize = toIntExact(postScript.getMetadataLength());
    if (footerSize < 0) {
        throw new OrcCorruptionException(orcDataSource.getId(), "Invalid footer length %s", footerSize);
    }
    if (metadataSize < 0) {
        throw new OrcCorruptionException(orcDataSource.getId(), "Invalid metadata length %s", metadataSize);
    }
    // read DWRF stripe cache only if this feature is enabled and it has meaningful data
    boolean readDwrfStripeCache = dwrfStripeCacheEnabled && postScript.getDwrfStripeCacheLength().isPresent() && postScript.getDwrfStripeCacheMode().isPresent() && postScript.getDwrfStripeCacheMode().get() != DwrfStripeCacheMode.NONE;
    int dwrfStripeCacheSize = 0;
    if (readDwrfStripeCache) {
        dwrfStripeCacheSize = postScript.getDwrfStripeCacheLength().getAsInt();
        checkSizes(orcDataSource, metadataSize, dwrfStripeCacheSize);
    }
    // The section lengths come from the file's postscript, so add them up as a long: an int sum
    // could silently overflow for a corrupt file. A tail that is longer than the file itself, or
    // that cannot be held in a byte array, is reported as corruption instead of failing later
    // with a negative array size, a negative read offset or an out-of-bounds slice.
    long completeFooterLength = (long) dwrfStripeCacheSize + metadataSize + footerSize + postScriptSize + SIZE_OF_BYTE;
    if (completeFooterLength > size || completeFooterLength > Integer.MAX_VALUE) {
        throw new OrcCorruptionException(orcDataSource.getId(), "Invalid file tail length %s", completeFooterLength);
    }
    // check if extra bytes need to be read
    Slice completeFooterSlice;
    int completeFooterSize = toIntExact(completeFooterLength);
    if (completeFooterSize > buffer.length) {
        // allocate a new buffer large enough for the complete footer
        byte[] newBuffer = new byte[completeFooterSize];
        completeFooterSlice = Slices.wrappedBuffer(newBuffer);
        // initial read was not large enough, so read missing section
        orcDataSource.readFully(size - completeFooterSize, newBuffer, 0, completeFooterSize - buffer.length);
        // copy already read bytes into the new buffer
        completeFooterSlice.setBytes(completeFooterSize - buffer.length, buffer);
    } else {
        // footer is already in the bytes in buffer, just adjust position, length
        completeFooterSlice = Slices.wrappedBuffer(buffer, buffer.length - completeFooterSize, completeFooterSize);
    }
    // metadataSize is set only for ORC files, dwrfStripeCacheSize is set only for DWRF files
    // it should be safe to sum them up to find footer offset
    // TAIL: [ ORC_METADATA{0,1} | DWRF_STRIPE_CACHE {0,1} ] + FOOTER + POST_SCRIPT + POST_SCRIPT_SIZE (1 byte)
    int footerSliceOffset = metadataSize + dwrfStripeCacheSize;
    Slice footerSlice = completeFooterSlice.slice(footerSliceOffset, footerSize);
    // metadata, when present, starts at the beginning of the tail
    Slice metadataSlice = completeFooterSlice.slice(0, metadataSize);
    // set DwrfStripeCacheData only if the stripe cache feature is enabled and the file has the stripe cache
    Optional<DwrfStripeCacheData> dwrfStripeCacheData = Optional.empty();
    if (readDwrfStripeCache) {
        // the stripe cache, when present, also starts at the beginning of the tail
        Slice dwrfStripeCacheSlice = completeFooterSlice.slice(0, dwrfStripeCacheSize);
        DwrfStripeCacheMode stripeCacheMode = postScript.getDwrfStripeCacheMode().get();
        dwrfStripeCacheData = Optional.of(new DwrfStripeCacheData(dwrfStripeCacheSlice, dwrfStripeCacheSize, stripeCacheMode));
    }
    return new OrcFileTail(hiveWriterVersion, bufferSize, compressionKind, footerSlice, footerSize, metadataSlice, metadataSize, dwrfStripeCacheData);
}
Also used : CompressionKind(com.facebook.presto.orc.metadata.CompressionKind) OrcFileTail(com.facebook.presto.orc.metadata.OrcFileTail) PostScript(com.facebook.presto.orc.metadata.PostScript) DwrfStripeCacheMode(com.facebook.presto.orc.metadata.DwrfStripeCacheMode) Slice(io.airlift.slice.Slice) DwrfStripeCacheData(com.facebook.presto.orc.metadata.DwrfStripeCacheData) OrcCorruptionException(com.facebook.presto.orc.OrcCorruptionException)

Aggregations

DwrfStripeCacheMode (com.facebook.presto.orc.metadata.DwrfStripeCacheMode): 3; DataSize (io.airlift.units.DataSize): 2; Test (org.testng.annotations.Test): 2; StreamLayoutType (com.facebook.presto.hive.OrcFileWriterConfig.StreamLayoutType): 1; OrcCorruptionException (com.facebook.presto.orc.OrcCorruptionException): 1; OrcWriterOptions (com.facebook.presto.orc.OrcWriterOptions): 1; CompressionKind (com.facebook.presto.orc.metadata.CompressionKind): 1; DwrfStripeCacheData (com.facebook.presto.orc.metadata.DwrfStripeCacheData): 1; OrcFileTail (com.facebook.presto.orc.metadata.OrcFileTail): 1; PostScript (com.facebook.presto.orc.metadata.PostScript): 1; StreamLayoutFactory (com.facebook.presto.orc.writer.StreamLayoutFactory): 1; ColumnSizeLayoutFactory (com.facebook.presto.orc.writer.StreamLayoutFactory.ColumnSizeLayoutFactory): 1; StreamSizeLayoutFactory (com.facebook.presto.orc.writer.StreamLayoutFactory.StreamSizeLayoutFactory): 1; Slice (io.airlift.slice.Slice): 1; OptionalInt (java.util.OptionalInt): 1