use of com.facebook.presto.orc.metadata.DwrfStripeCacheMode in project presto by prestodb.
the class TestOrcWriterOptions method testToString.
/**
 * Builds an {@code OrcWriterOptions} with a distinct, non-default-looking value for every
 * setting used below and checks the exact text produced by {@code toString()}.
 */
@Test
public void testToString() {
    // flush policy inputs
    DataSize minStripe = new DataSize(13, MEGABYTE);
    DataSize maxStripe = new DataSize(27, MEGABYTE);
    int maxStripeRows = 1_100_000;

    // row group and dictionary inputs
    int maxRowGroupRows = 15_000;
    DataSize dictionaryMemoryLimit = new DataSize(13_000, KILOBYTE);
    DataSize dictionaryAlmostFullRange = new DataSize(1_000, KILOBYTE);
    int usefulCheckFrequency = 9_999;
    DataSize usefulCheckColumnSize = new DataSize(1, MEGABYTE);

    // statistics and compression inputs
    DataSize stringStatisticsLimit = new DataSize(128, BYTE);
    DataSize compressionBufferLimit = new DataSize(512, KILOBYTE);
    OptionalInt level = OptionalInt.of(5);

    // DWRF stripe cache inputs
    DataSize stripeCacheLimit = new DataSize(4, MEGABYTE);
    DwrfStripeCacheMode stripeCacheMode = DwrfStripeCacheMode.INDEX_AND_FOOTER;

    // layout and encoding inputs
    StreamLayoutFactory layoutFactory = new ColumnSizeLayoutFactory();
    boolean integerDictionaryEncoding = false;
    boolean stringDictionarySorting = true;
    int directEncodingStripeCount = 0;

    OrcWriterOptions writerOptions = OrcWriterOptions.builder()
            .withFlushPolicy(DefaultOrcWriterFlushPolicy.builder()
                    .withStripeMinSize(minStripe)
                    .withStripeMaxSize(maxStripe)
                    .withStripeMaxRowCount(maxStripeRows)
                    .build())
            .withRowGroupMaxRowCount(maxRowGroupRows)
            .withDictionaryMaxMemory(dictionaryMemoryLimit)
            .withDictionaryMemoryAlmostFullRange(dictionaryAlmostFullRange)
            .withDictionaryUsefulCheckPerChunkFrequency(usefulCheckFrequency)
            .withDictionaryUsefulCheckColumnSize(usefulCheckColumnSize)
            .withMaxStringStatisticsLimit(stringStatisticsLimit)
            .withMaxCompressionBufferSize(compressionBufferLimit)
            .withCompressionLevel(level)
            .withStreamLayoutFactory(layoutFactory)
            .withIntegerDictionaryEncodingEnabled(integerDictionaryEncoding)
            .withStringDictionarySortingEnabled(stringDictionarySorting)
            .withDwrfStripeCacheEnabled(true)
            .withDwrfStripeCacheMaxSize(stripeCacheLimit)
            .withDwrfStripeCacheMode(stripeCacheMode)
            .withPreserveDirectEncodingStripeCount(directEncodingStripeCount)
            .build();

    // the literal fragments are kept exactly as they were; only the line breaks differ
    String expectedString = "OrcWriterOptions{flushPolicy=DefaultOrcWriterFlushPolicy{stripeMaxRowCount=1100000, "
            + "stripeMinBytes=13631488, stripeMaxBytes=28311552}, rowGroupMaxRowCount=15000, "
            + "dictionaryMaxMemory=13000kB, dictionaryMemoryAlmostFullRange=1000kB, dictionaryUsefulCheckPerChunkFrequency=9999, "
            + "dictionaryUsefulCheckColumnSize=1MB, maxStringStatisticsLimit=128B, maxCompressionBufferSize=512kB, "
            + "compressionLevel=OptionalInt[5], streamLayoutFactory=ColumnSizeLayoutFactory{}, integerDictionaryEncodingEnabled=false, "
            + "stringDictionarySortingEnabled=true, stringDictionaryEncodingEnabled=true, "
            + "dwrfWriterOptions=Optional[DwrfStripeCacheOptions{stripeCacheMode=INDEX_AND_FOOTER, stripeCacheMaxSize=4MB}], "
            + "ignoreDictionaryRowGroupSizes=false, preserveDirectEncodingStripeCount=0}";

    String renderedOptions = writerOptions.toString();
    assertEquals(expectedString, renderedOptions);
}
use of com.facebook.presto.orc.metadata.DwrfStripeCacheMode in project presto by prestodb.
the class TestOrcFileWriterConfig method testOrcWriterOptionsBuilder.
/**
 * Sets every property on an {@code OrcFileWriterConfig}, checks that the getters return the
 * same values, and then checks the {@code OrcWriterOptions} built from that config.
 */
@Test
public void testOrcWriterOptionsBuilder() {
    DataSize minStripe = new DataSize(10, MEGABYTE);
    DataSize maxStripe = new DataSize(50, MEGABYTE);
    int maxStripeRows = 1_000_000;
    int maxRowGroupRows = 15_000;
    DataSize dictionaryMemoryLimit = new DataSize(20, MEGABYTE);
    DataSize statisticsLimit = new DataSize(32, BYTE);
    DataSize compressionBufferLimit = new DataSize(512, KILOBYTE);
    StreamLayoutType layoutType = BY_STREAM_SIZE;
    DataSize stripeCacheLimit = new DataSize(4, MEGABYTE);
    DwrfStripeCacheMode stripeCacheMode = INDEX;

    // the stripe cache is disabled here while its size and mode are still set
    OrcFileWriterConfig config = new OrcFileWriterConfig()
            .setStripeMinSize(minStripe)
            .setStripeMaxSize(maxStripe)
            .setStripeMaxRowCount(maxStripeRows)
            .setRowGroupMaxRowCount(maxRowGroupRows)
            .setDictionaryMaxMemory(dictionaryMemoryLimit)
            .setStringStatisticsLimit(statisticsLimit)
            .setMaxCompressionBufferSize(compressionBufferLimit)
            .setStreamLayoutType(layoutType)
            .setDwrfStripeCacheEnabled(false)
            .setDwrfStripeCacheMaxSize(stripeCacheLimit)
            .setDwrfStripeCacheMode(stripeCacheMode);

    // every setter value must be readable back through its getter
    assertEquals(minStripe, config.getStripeMinSize());
    assertEquals(maxStripe, config.getStripeMaxSize());
    assertEquals(maxStripeRows, config.getStripeMaxRowCount());
    assertEquals(maxRowGroupRows, config.getRowGroupMaxRowCount());
    assertEquals(dictionaryMemoryLimit, config.getDictionaryMaxMemory());
    assertEquals(statisticsLimit, config.getStringStatisticsLimit());
    assertEquals(compressionBufferLimit, config.getMaxCompressionBufferSize());
    assertEquals(layoutType, config.getStreamLayoutType());
    assertFalse(config.isDwrfStripeCacheEnabled());
    assertEquals(stripeCacheLimit, config.getDwrfStripeCacheMaxSize());
    assertEquals(stripeCacheMode, config.getDwrfStripeCacheMode());

    // two calls must not hand back the same builder instance
    assertNotSame(config.toOrcWriterOptionsBuilder(), config.toOrcWriterOptionsBuilder());

    // the options built from the config must carry the configured values
    OrcWriterOptions options = config.toOrcWriterOptionsBuilder().build();
    assertEquals(toIntExact(minStripe.toBytes()), options.getFlushPolicy().getStripeMinBytes());
    assertEquals(toIntExact(maxStripe.toBytes()), options.getFlushPolicy().getStripeMaxBytes());
    assertEquals(maxStripeRows, options.getFlushPolicy().getStripeMaxRowCount());
    assertEquals(maxRowGroupRows, options.getRowGroupMaxRowCount());
    assertEquals(dictionaryMemoryLimit, options.getDictionaryMaxMemory());
    assertEquals(statisticsLimit, options.getMaxStringStatisticsLimit());
    assertEquals(compressionBufferLimit, options.getMaxCompressionBufferSize());
    assertTrue(options.getStreamLayoutFactory() instanceof StreamSizeLayoutFactory);

    // with the stripe cache disabled, the built options expose no stripe cache options
    assertEquals(Optional.empty(), options.getDwrfStripeCacheOptions());
}
use of com.facebook.presto.orc.metadata.DwrfStripeCacheMode in project presto by prestodb.
the class StorageOrcFileTailSource method getOrcFileTail.
/**
 * Reads the tail of a file from {@code orcDataSource} and decodes it into an {@link OrcFileTail}.
 *
 * <p>The method reads up to {@code expectedFooterSizeInBytes} bytes from the end of the file,
 * decodes the postscript, and then slices out the footer, the metadata section and, when the
 * feature is enabled and the postscript advertises one, the DWRF stripe cache. If the first read
 * did not cover the whole tail, the missing leading part is fetched with a second read.
 *
 * @param orcDataSource source the tail bytes are read from
 * @param metadataReader reader used to decode the postscript
 * @param writeValidation optional validation handed to {@code validateWrite} for the version and
 *        compression checks
 * @param cacheable not used by this implementation
 * @return the decoded file tail
 * @throws OrcCorruptionException if the file is not longer than the magic, the postscript length
 *         does not fit in the bytes read, the postscript cannot be decoded and the header magic is
 *         not valid, the footer or metadata length is negative, or the combined tail length is
 *         larger than the file or than a Java array can hold
 * @throws IOException declared by the signature; presumably raised by the data source reads
 */
@Override
public OrcFileTail getOrcFileTail(OrcDataSource orcDataSource, MetadataReader metadataReader, Optional<OrcWriteValidation> writeValidation, boolean cacheable) throws IOException {
    long size = orcDataSource.getSize();
    if (size <= MAGIC.length()) {
        throw new OrcCorruptionException(orcDataSource.getId(), "Invalid file size %s", size);
    }

    // Read the tail of the file
    byte[] buffer = new byte[toIntExact(min(size, expectedFooterSizeInBytes))];
    orcDataSource.readFully(size - buffer.length, buffer);

    // get length of PostScript - last byte of the file
    int postScriptSize = buffer[buffer.length - SIZE_OF_BYTE] & 0xff;
    if (postScriptSize >= buffer.length) {
        throw new OrcCorruptionException(orcDataSource.getId(), "Invalid postscript length %s", postScriptSize);
    }

    // decode the post script
    PostScript postScript;
    try {
        postScript = metadataReader.readPostScript(buffer, buffer.length - SIZE_OF_BYTE - postScriptSize, postScriptSize);
    } catch (OrcCorruptionException e) {
        // check if this is an ORC file and not an RCFile or something else
        if (!isValidHeaderMagic(orcDataSource)) {
            throw new OrcCorruptionException(orcDataSource.getId(), "Not an ORC file");
        }
        throw e;
    }

    // verify this is a supported version
    checkOrcVersion(orcDataSource, postScript.getVersion());
    validateWrite(writeValidation, orcDataSource, validation -> validation.getVersion().equals(postScript.getVersion()), "Unexpected version");

    int bufferSize = toIntExact(postScript.getCompressionBlockSize());

    // check compression codec is supported
    CompressionKind compressionKind = postScript.getCompression();
    validateWrite(writeValidation, orcDataSource, validation -> validation.getCompression() == compressionKind, "Unexpected compression");

    PostScript.HiveWriterVersion hiveWriterVersion = postScript.getHiveWriterVersion();

    int footerSize = toIntExact(postScript.getFooterLength());
    int metadataSize = toIntExact(postScript.getMetadataLength());
    if (footerSize < 0) {
        throw new OrcCorruptionException(orcDataSource.getId(), "Invalid footer length %s", footerSize);
    }
    if (metadataSize < 0) {
        throw new OrcCorruptionException(orcDataSource.getId(), "Invalid metadata length %s", metadataSize);
    }

    // read DWRF stripe cache only if this feature is enabled and it has meaningful data
    boolean readDwrfStripeCache = dwrfStripeCacheEnabled
            && postScript.getDwrfStripeCacheLength().isPresent()
            && postScript.getDwrfStripeCacheMode().isPresent()
            && postScript.getDwrfStripeCacheMode().get() != DwrfStripeCacheMode.NONE;
    int dwrfStripeCacheSize = 0;
    if (readDwrfStripeCache) {
        dwrfStripeCacheSize = postScript.getDwrfStripeCacheLength().getAsInt();
        checkSizes(orcDataSource, metadataSize, dwrfStripeCacheSize);
    }

    // The section lengths come from the file itself, so add them up in long arithmetic:
    // an int sum could wrap to a negative value for a corrupt postscript. A tail that is
    // longer than the file, or too long for a byte array, is reported as corruption instead
    // of failing later with a negative read position or an out-of-bounds slice.
    long completeFooterLength = (long) dwrfStripeCacheSize + metadataSize + footerSize + postScriptSize + SIZE_OF_BYTE;
    if (completeFooterLength > size || completeFooterLength > Integer.MAX_VALUE) {
        throw new OrcCorruptionException(orcDataSource.getId(), "Invalid file tail length %s for file size %s", completeFooterLength, size);
    }
    int completeFooterSize = (int) completeFooterLength;

    // check if extra bytes need to be read
    Slice completeFooterSlice;
    if (completeFooterSize > buffer.length) {
        // allocate a new buffer large enough for the complete footer
        byte[] newBuffer = new byte[completeFooterSize];
        completeFooterSlice = Slices.wrappedBuffer(newBuffer);
        // initial read was not large enough, so read missing section
        orcDataSource.readFully(size - completeFooterSize, newBuffer, 0, completeFooterSize - buffer.length);
        // copy already read bytes into the new buffer
        completeFooterSlice.setBytes(completeFooterSize - buffer.length, buffer);
    } else {
        // footer is already in the bytes in buffer, just adjust position, length
        completeFooterSlice = Slices.wrappedBuffer(buffer, buffer.length - completeFooterSize, completeFooterSize);
    }

    // metadataSize is set only for ORC files, dwrfStripeCacheSize is set only for DWRF files
    // it should be safe to sum them up to find footer offset
    // TAIL: [ ORC_METADATA{0,1} | DWRF_STRIPE_CACHE {0,1} ] + FOOTER + POST_SCRIPT + POST_SCRIPT_SIZE (1 byte)
    int footerSliceOffset = metadataSize + dwrfStripeCacheSize;
    Slice footerSlice = completeFooterSlice.slice(footerSliceOffset, footerSize);
    // both the metadata slice and the stripe cache slice start at offset 0 of the tail
    Slice metadataSlice = completeFooterSlice.slice(0, metadataSize);

    // set DwrfStripeCacheData only if the stripe cache feature is enabled and the file has the stripe cache
    Optional<DwrfStripeCacheData> dwrfStripeCacheData = Optional.empty();
    if (readDwrfStripeCache) {
        Slice dwrfStripeCacheSlice = completeFooterSlice.slice(0, dwrfStripeCacheSize);
        DwrfStripeCacheMode stripeCacheMode = postScript.getDwrfStripeCacheMode().get();
        dwrfStripeCacheData = Optional.of(new DwrfStripeCacheData(dwrfStripeCacheSlice, dwrfStripeCacheSize, stripeCacheMode));
    }

    return new OrcFileTail(hiveWriterVersion, bufferSize, compressionKind, footerSlice, footerSize, metadataSlice, metadataSize, dwrfStripeCacheData);
}
Aggregations