use of io.prestosql.orc.metadata.MetadataReader in project hetu-core by openlookeng.
the class OrcFileTail method readFrom.
public static OrcFileTail readFrom(OrcDataSource orcDataSource, Optional<OrcWriteValidation> writeValidation) throws IOException {
OrcFileTail orcFileTail = new OrcFileTail();
//
// Read the file tail:
//
// variable: Footer
// variable: Metadata
// variable: PostScript - contains length of footer and metadata
// 1 byte: postScriptSize
// figure out the size of the file using the option or filesystem
long size = orcDataSource.getSize();
if (size <= PostScript.MAGIC.length()) {
throw new OrcCorruptionException(orcDataSource.getId(), "Invalid file size %s", size);
}
// Read the tail of the file
int expectedBufferSize = toIntExact(min(size, EXPECTED_FOOTER_SIZE));
Slice buffer = orcDataSource.readFully(size - expectedBufferSize, expectedBufferSize);
// get length of PostScript - last byte of the file
int postScriptSize = buffer.getUnsignedByte(buffer.length() - SIZE_OF_BYTE);
if (postScriptSize >= buffer.length()) {
throw new OrcCorruptionException(orcDataSource.getId(), "Invalid postscript length %s", postScriptSize);
}
MetadataReader metadataReader = new ExceptionWrappingMetadataReader(orcDataSource.getId(), new OrcMetadataReader());
// decode the post script
try {
orcFileTail.postScript = metadataReader.readPostScript(buffer.slice(buffer.length() - SIZE_OF_BYTE - postScriptSize, postScriptSize).getInput());
} catch (OrcCorruptionException e) {
// check if this is an ORC file and not an RCFile or something else
if (!isValidHeaderMagic(orcDataSource)) {
throw new OrcCorruptionException(orcDataSource.getId(), "Not an ORC file");
}
throw e;
}
// verify this is a supported version
checkOrcVersion(orcDataSource, orcFileTail.postScript.getVersion());
validateWrite(validation -> validation.getVersion().equals(orcFileTail.postScript.getVersion()), writeValidation, orcDataSource, "Unexpected version");
int bufferSize = toIntExact(orcFileTail.postScript.getCompressionBlockSize());
// check compression codec is supported
CompressionKind compressionKind = orcFileTail.postScript.getCompression();
orcFileTail.decompressor = OrcDecompressor.createOrcDecompressor(orcDataSource.getId(), compressionKind, bufferSize);
validateWrite(validation -> validation.getCompression() == compressionKind, writeValidation, orcDataSource, "Unexpected compression");
PostScript.HiveWriterVersion hiveWriterVersion = orcFileTail.postScript.getHiveWriterVersion();
int footerSize = toIntExact(orcFileTail.postScript.getFooterLength());
int metadataSize = toIntExact(orcFileTail.postScript.getMetadataLength());
// check if extra bytes need to be read
Slice completeFooterSlice;
int completeFooterSize = footerSize + metadataSize + postScriptSize + SIZE_OF_BYTE;
if (completeFooterSize > buffer.length()) {
// initial read was not large enough, so just read again with the correct size
completeFooterSlice = orcDataSource.readFully(size - completeFooterSize, completeFooterSize);
} else {
// footer is already in the bytes in buffer, just adjust position, length
completeFooterSlice = buffer.slice(buffer.length() - completeFooterSize, completeFooterSize);
}
// read metadata
Slice metadataSlice = completeFooterSlice.slice(0, metadataSize);
try (InputStream metadataInputStream = new OrcInputStream(OrcChunkLoader.create(orcDataSource.getId(), metadataSlice, orcFileTail.decompressor, newSimpleAggregatedMemoryContext()))) {
orcFileTail.metadata = metadataReader.readMetadata(hiveWriterVersion, metadataInputStream);
}
// read footer
Slice footerSlice = completeFooterSlice.slice(metadataSize, footerSize);
try (InputStream footerInputStream = new OrcInputStream(OrcChunkLoader.create(orcDataSource.getId(), footerSlice, orcFileTail.decompressor, newSimpleAggregatedMemoryContext()))) {
orcFileTail.footer = metadataReader.readFooter(hiveWriterVersion, footerInputStream);
}
if (orcFileTail.footer.getTypes().size() == 0) {
throw new OrcCorruptionException(orcDataSource.getId(), "File has no columns");
}
validateWrite(validation -> validation.getColumnNames().equals(orcFileTail.footer.getTypes().get(new OrcColumnId(0)).getFieldNames()), writeValidation, orcDataSource, "Unexpected column names");
validateWrite(validation -> validation.getRowGroupMaxRowCount() == orcFileTail.footer.getRowsInRowGroup(), writeValidation, orcDataSource, "Unexpected rows in group");
if (writeValidation.isPresent()) {
writeValidation.get().validateMetadata(orcDataSource.getId(), orcFileTail.footer.getUserMetadata());
writeValidation.get().validateFileStatistics(orcDataSource.getId(), orcFileTail.footer.getFileStats());
writeValidation.get().validateStripeStatistics(orcDataSource.getId(), orcFileTail.footer.getStripes(), orcFileTail.metadata.getStripeStatsList());
}
return orcFileTail;
}
Aggregations