Use of com.facebook.presto.orc.metadata.DwrfStripeCacheData in project presto by prestodb.
From the class OrcWriter, the method bufferFileFooter:
/**
 * Collect the data for the file footer. This is not the actual data, but
 * rather functions that know how to write the data.
 */
private List<DataOutput> bufferFileFooter()
        throws IOException
{
    List<DataOutput> outputData = new ArrayList<>();

    Metadata metadata = new Metadata(closedStripes.stream().map(ClosedStripe::getStatistics).collect(toList()));
    Slice metadataSlice = metadataWriter.writeMetadata(metadata);
    outputData.add(createDataOutput(metadataSlice));

    numberOfRows = closedStripes.stream().mapToLong(stripe -> stripe.getStripeInformation().getNumberOfRows()).sum();

    List<ColumnStatistics> fileStats = toFileStats(closedStripes.stream()
            .map(ClosedStripe::getStatistics)
            .map(StripeStatistics::getColumnStatistics)
            .collect(toList()));
    recordValidation(validation -> validation.setFileStatistics(fileStats));

    Map<String, Slice> userMetadata = this.userMetadata.entrySet().stream()
            .collect(Collectors.toMap(Entry::getKey, entry -> utf8Slice(entry.getValue())));

    unencryptedStats = new ArrayList<>();
    Map<Integer, Map<Integer, Slice>> encryptedStats = new HashMap<>();
    addStatsRecursive(fileStats, 0, new HashMap<>(), unencryptedStats, encryptedStats);

    Optional<DwrfEncryption> dwrfEncryption;
    if (dwrfWriterEncryption.isPresent()) {
        ImmutableList.Builder<EncryptionGroup> encryptionGroupBuilder = ImmutableList.builder();
        List<WriterEncryptionGroup> writerEncryptionGroups = dwrfWriterEncryption.get().getWriterEncryptionGroups();
        for (int i = 0; i < writerEncryptionGroups.size(); i++) {
            WriterEncryptionGroup group = writerEncryptionGroups.get(i);
            Map<Integer, Slice> groupStats = encryptedStats.get(i);
            encryptionGroupBuilder.add(new EncryptionGroup(
                    group.getNodes(),
                    // reader will just use key metadata from the stripe
                    Optional.empty(),
                    group.getNodes().stream().map(groupStats::get).collect(toList())));
        }
        dwrfEncryption = Optional.of(new DwrfEncryption(
                dwrfWriterEncryption.get().getKeyProvider(),
                encryptionGroupBuilder.build()));
    }
    else {
        dwrfEncryption = Optional.empty();
    }

    Optional<DwrfStripeCacheData> dwrfStripeCacheData = dwrfStripeCacheWriter.map(DwrfStripeCacheWriter::getDwrfStripeCacheData);
    Slice dwrfStripeCacheSlice = metadataWriter.writeDwrfStripeCache(dwrfStripeCacheData);
    outputData.add(createDataOutput(dwrfStripeCacheSlice));
    Optional<List<Integer>> dwrfStripeCacheOffsets = dwrfStripeCacheWriter.map(DwrfStripeCacheWriter::getOffsets);

    Footer footer = new Footer(
            numberOfRows,
            rowGroupMaxRowCount,
            OptionalLong.of(rawSize),
            closedStripes.stream().map(ClosedStripe::getStripeInformation).collect(toList()),
            orcTypes,
            ImmutableList.copyOf(unencryptedStats),
            userMetadata,
            dwrfEncryption,
            dwrfStripeCacheOffsets);
    closedStripes.clear();
    closedStripesRetainedBytes = 0;

    Slice footerSlice = metadataWriter.writeFooter(footer);
    outputData.add(createDataOutput(footerSlice));
    recordValidation(validation -> validation.setVersion(metadataWriter.getOrcMetadataVersion()));

    Slice postscriptSlice = metadataWriter.writePostscript(
            footerSlice.length(),
            metadataSlice.length(),
            columnWriterOptions.getCompressionKind(),
            columnWriterOptions.getCompressionMaxBufferSize(),
            dwrfStripeCacheData);
    outputData.add(createDataOutput(postscriptSlice));
    outputData.add(createDataOutput(Slices.wrappedBuffer((byte) postscriptSlice.length())));
    return outputData;
}
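The pieces are appended to outputData in the same physical order that the tail reader later expects: metadata, DWRF stripe cache, footer, postscript, and a single byte holding the postscript length. A minimal sketch of that layout arithmetic, using made-up section sizes (the variable names and values below are illustrative only, not Presto APIs):

// Illustrative only: hypothetical sizes of the serialized sections appended to outputData,
// in write order. Per the reader's comment further down, metadata is populated only for
// ORC files and the stripe cache only for DWRF files, so summing both lengths is safe.
int metadataLength = 120;      // metadataSlice (stripe statistics)
int stripeCacheLength = 4096;  // dwrfStripeCacheSlice
int footerLength = 350;        // footerSlice
int postScriptLength = 25;     // postscriptSlice

// TAIL: METADATA | DWRF_STRIPE_CACHE | FOOTER | POST_SCRIPT | POST_SCRIPT length (1 byte)
int tailLength = metadataLength + stripeCacheLength + footerLength + postScriptLength + 1;
// offset of the footer within the tail, matching how StorageOrcFileTailSource.getOrcFileTail
// (shown below) computes footerSliceOffset
int footerOffsetInTail = metadataLength + stripeCacheLength;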
Use of com.facebook.presto.orc.metadata.DwrfStripeCacheData in project presto by prestodb.
From the class TestStorageOrcFileTailSource, the method testReadDwrfStripeCacheIfEnabled:
@Test
public void testReadDwrfStripeCacheIfEnabled()
        throws IOException
{
    FileOutputStream out = new FileOutputStream(file.getFile());

    // write a fake stripe cache
    byte[] stripeCache = new byte[100];
    for (int i = 0; i < stripeCache.length; i++) {
        stripeCache[i] = (byte) i;
    }
    out.write(stripeCache);

    // write the footer and post script
    DwrfProto.Footer.Builder footer = DwrfProto.Footer.newBuilder()
            .addAllStripeCacheOffsets(ImmutableList.of(1, 2, 3));
    DwrfProto.PostScript.Builder postScript = DwrfProto.PostScript.newBuilder()
            .setCompression(NONE)
            .setCacheMode(BOTH)
            .setCacheSize(stripeCache.length);
    writeTail(footer, postScript, out);
    out.close();

    // read the file tail with the "read dwrf stripe cache" feature enabled
    StorageOrcFileTailSource src = new StorageOrcFileTailSource(FOOTER_READ_SIZE_IN_BYTES, true);
    OrcDataSource orcDataSource = createFileOrcDataSource();
    OrcFileTail orcFileTail = src.getOrcFileTail(orcDataSource, metadataReader, Optional.empty(), false);

    assertEquals(orcFileTail.getMetadataSize(), 0);
    DwrfProto.Footer actualFooter = readFooter(orcFileTail);
    assertEquals(actualFooter, footer.build());

    // make sure the stripe cache is loaded correctly
    assertTrue(orcFileTail.getDwrfStripeCacheData().isPresent());
    DwrfStripeCacheData dwrfStripeCacheData = orcFileTail.getDwrfStripeCacheData().get();
    assertEquals(dwrfStripeCacheData.getDwrfStripeCacheMode(), INDEX_AND_FOOTER);
    assertEquals(dwrfStripeCacheData.getDwrfStripeCacheSize(), stripeCache.length);
    assertEquals(dwrfStripeCacheData.getDwrfStripeCacheSlice().getBytes(), stripeCache);
}
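The test relies on the tail layout sketched earlier: the fake stripe cache bytes come first, followed by the serialized footer, the postscript, and the one-byte postscript length, which is what lets getOrcFileTail locate the cache at the start of the tail. The writeTail helper itself is not shown here; a hypothetical standalone equivalent might look like the sketch below (setFooterLength is assumed to correspond to the footerLength field of the DWRF PostScript proto; the real helper may differ in details):

// Hypothetical helper, not the writeTail(...) used by TestStorageOrcFileTailSource.
private static void writeFakeTail(DwrfProto.Footer.Builder footer, DwrfProto.PostScript.Builder postScript, OutputStream out)
        throws IOException
{
    byte[] footerBytes = footer.build().toByteArray();
    // assumption: the postscript records the serialized footer size so the reader can slice it out
    postScript.setFooterLength(footerBytes.length);
    byte[] postScriptBytes = postScript.build().toByteArray();

    out.write(footerBytes);             // FOOTER
    out.write(postScriptBytes);         // POST_SCRIPT
    out.write(postScriptBytes.length);  // 1-byte POST_SCRIPT length (must fit in a single byte)
}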
Use of com.facebook.presto.orc.metadata.DwrfStripeCacheData in project presto by prestodb.
From the class StorageOrcFileTailSource, the method getOrcFileTail:
@Override
public OrcFileTail getOrcFileTail(OrcDataSource orcDataSource, MetadataReader metadataReader, Optional<OrcWriteValidation> writeValidation, boolean cacheable)
        throws IOException
{
    long size = orcDataSource.getSize();
    if (size <= MAGIC.length()) {
        throw new OrcCorruptionException(orcDataSource.getId(), "Invalid file size %s", size);
    }

    // Read the tail of the file
    byte[] buffer = new byte[toIntExact(min(size, expectedFooterSizeInBytes))];
    orcDataSource.readFully(size - buffer.length, buffer);

    // get length of PostScript - last byte of the file
    int postScriptSize = buffer[buffer.length - SIZE_OF_BYTE] & 0xff;
    if (postScriptSize >= buffer.length) {
        throw new OrcCorruptionException(orcDataSource.getId(), "Invalid postscript length %s", postScriptSize);
    }

    // decode the post script
    PostScript postScript;
    try {
        postScript = metadataReader.readPostScript(buffer, buffer.length - SIZE_OF_BYTE - postScriptSize, postScriptSize);
    }
    catch (OrcCorruptionException e) {
        // check if this is an ORC file and not an RCFile or something else
        if (!isValidHeaderMagic(orcDataSource)) {
            throw new OrcCorruptionException(orcDataSource.getId(), "Not an ORC file");
        }
        throw e;
    }

    // verify this is a supported version
    checkOrcVersion(orcDataSource, postScript.getVersion());
    validateWrite(writeValidation, orcDataSource, validation -> validation.getVersion().equals(postScript.getVersion()), "Unexpected version");

    int bufferSize = toIntExact(postScript.getCompressionBlockSize());

    // check compression codec is supported
    CompressionKind compressionKind = postScript.getCompression();
    validateWrite(writeValidation, orcDataSource, validation -> validation.getCompression() == compressionKind, "Unexpected compression");

    PostScript.HiveWriterVersion hiveWriterVersion = postScript.getHiveWriterVersion();
    int footerSize = toIntExact(postScript.getFooterLength());
    int metadataSize = toIntExact(postScript.getMetadataLength());
    if (footerSize < 0) {
        throw new OrcCorruptionException(orcDataSource.getId(), "Invalid footer length %s", footerSize);
    }
    if (metadataSize < 0) {
        throw new OrcCorruptionException(orcDataSource.getId(), "Invalid metadata length %s", metadataSize);
    }

    // read DWRF stripe cache only if this feature is enabled and it has meaningful data
    boolean readDwrfStripeCache = dwrfStripeCacheEnabled
            && postScript.getDwrfStripeCacheLength().isPresent()
            && postScript.getDwrfStripeCacheMode().isPresent()
            && postScript.getDwrfStripeCacheMode().get() != DwrfStripeCacheMode.NONE;
    int dwrfStripeCacheSize = 0;
    if (readDwrfStripeCache) {
        dwrfStripeCacheSize = postScript.getDwrfStripeCacheLength().getAsInt();
        checkSizes(orcDataSource, metadataSize, dwrfStripeCacheSize);
    }

    // check if extra bytes need to be read
    Slice completeFooterSlice;
    int completeFooterSize = dwrfStripeCacheSize + metadataSize + footerSize + postScriptSize + SIZE_OF_BYTE;
    if (completeFooterSize > buffer.length) {
        // allocate a new buffer large enough for the complete footer
        byte[] newBuffer = new byte[completeFooterSize];
        completeFooterSlice = Slices.wrappedBuffer(newBuffer);

        // initial read was not large enough, so read missing section
        orcDataSource.readFully(size - completeFooterSize, newBuffer, 0, completeFooterSize - buffer.length);

        // copy already read bytes into the new buffer
        completeFooterSlice.setBytes(completeFooterSize - buffer.length, buffer);
    }
    else {
        // footer is already in the bytes in buffer, just adjust position, length
        completeFooterSlice = Slices.wrappedBuffer(buffer, buffer.length - completeFooterSize, completeFooterSize);
    }

    // metadataSize is set only for ORC files, dwrfStripeCacheSize is set only for DWRF files,
    // so it should be safe to sum them up to find the footer offset
    // TAIL: [ ORC_METADATA{0,1} | DWRF_STRIPE_CACHE{0,1} ] + FOOTER + POST_SCRIPT + POST_SCRIPT_SIZE (1 byte)
    int footerSliceOffset = metadataSize + dwrfStripeCacheSize;
    Slice footerSlice = completeFooterSlice.slice(footerSliceOffset, footerSize);
    Slice metadataSlice = completeFooterSlice.slice(0, metadataSize);

    // set DwrfStripeCacheData only if the stripe cache feature is enabled and the file has the stripe cache
    Optional<DwrfStripeCacheData> dwrfStripeCacheData = Optional.empty();
    if (readDwrfStripeCache) {
        Slice dwrfStripeCacheSlice = completeFooterSlice.slice(0, dwrfStripeCacheSize);
        DwrfStripeCacheMode stripeCacheMode = postScript.getDwrfStripeCacheMode().get();
        dwrfStripeCacheData = Optional.of(new DwrfStripeCacheData(dwrfStripeCacheSlice, dwrfStripeCacheSize, stripeCacheMode));
    }

    return new OrcFileTail(hiveWriterVersion, bufferSize, compressionKind, footerSlice, footerSize, metadataSlice, metadataSize, dwrfStripeCacheData);
}
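The DwrfStripeCacheData returned in the OrcFileTail is just the raw cache slice plus its mode and size; locating an individual cached stripe footer or index additionally requires the stripeCacheOffsets list stored in the footer (written by bufferFileFooter above). A rough sketch of how one entry could be sliced out, assuming entry i spans [offsets[i], offsets[i + 1]) within the cache slice (a hypothetical helper, not the actual Presto reader code):

// Hypothetical helper: slice the i-th cached entry out of the stripe cache,
// assuming consecutive stripeCacheOffsets delimit each entry.
private static Slice getStripeCacheEntry(DwrfStripeCacheData cacheData, List<Integer> stripeCacheOffsets, int i)
{
    int start = stripeCacheOffsets.get(i);
    int end = stripeCacheOffsets.get(i + 1);
    return cacheData.getDwrfStripeCacheSlice().slice(start, end - start);
}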