Use of io.prestosql.orc.metadata.OrcMetadataReader in project hetu-core by openlookeng.
The class OrcFileTail, method readFrom:
public static OrcFileTail readFrom(OrcDataSource orcDataSource, Optional<OrcWriteValidation> writeValidation)
        throws IOException
{
    OrcFileTail orcFileTail = new OrcFileTail();
    //
    // Read the file tail:
    //
    // variable: Footer
    // variable: Metadata
    // variable: PostScript - contains length of footer and metadata
    // 1 byte: postScriptSize
    // figure out the size of the file using the option or filesystem
    long size = orcDataSource.getSize();
    if (size <= PostScript.MAGIC.length()) {
        throw new OrcCorruptionException(orcDataSource.getId(), "Invalid file size %s", size);
    }
    // Read the tail of the file
    int expectedBufferSize = toIntExact(min(size, EXPECTED_FOOTER_SIZE));
    Slice buffer = orcDataSource.readFully(size - expectedBufferSize, expectedBufferSize);
    // get length of PostScript - last byte of the file
    int postScriptSize = buffer.getUnsignedByte(buffer.length() - SIZE_OF_BYTE);
    if (postScriptSize >= buffer.length()) {
        throw new OrcCorruptionException(orcDataSource.getId(), "Invalid postscript length %s", postScriptSize);
    }
    MetadataReader metadataReader = new ExceptionWrappingMetadataReader(orcDataSource.getId(), new OrcMetadataReader());
    // decode the post script
    try {
        orcFileTail.postScript = metadataReader.readPostScript(buffer.slice(buffer.length() - SIZE_OF_BYTE - postScriptSize, postScriptSize).getInput());
    }
    catch (OrcCorruptionException e) {
        // check if this is an ORC file and not an RCFile or something else
        if (!isValidHeaderMagic(orcDataSource)) {
            throw new OrcCorruptionException(orcDataSource.getId(), "Not an ORC file");
        }
        throw e;
    }
    // verify this is a supported version
    checkOrcVersion(orcDataSource, orcFileTail.postScript.getVersion());
    validateWrite(validation -> validation.getVersion().equals(orcFileTail.postScript.getVersion()), writeValidation, orcDataSource, "Unexpected version");
    int bufferSize = toIntExact(orcFileTail.postScript.getCompressionBlockSize());
    // check compression codec is supported
    CompressionKind compressionKind = orcFileTail.postScript.getCompression();
    orcFileTail.decompressor = OrcDecompressor.createOrcDecompressor(orcDataSource.getId(), compressionKind, bufferSize);
    validateWrite(validation -> validation.getCompression() == compressionKind, writeValidation, orcDataSource, "Unexpected compression");
    PostScript.HiveWriterVersion hiveWriterVersion = orcFileTail.postScript.getHiveWriterVersion();
    int footerSize = toIntExact(orcFileTail.postScript.getFooterLength());
    int metadataSize = toIntExact(orcFileTail.postScript.getMetadataLength());
    // check if extra bytes need to be read
    Slice completeFooterSlice;
    int completeFooterSize = footerSize + metadataSize + postScriptSize + SIZE_OF_BYTE;
    if (completeFooterSize > buffer.length()) {
        // initial read was not large enough, so just read again with the correct size
        completeFooterSlice = orcDataSource.readFully(size - completeFooterSize, completeFooterSize);
    }
    else {
        // footer is already in the bytes in buffer, just adjust position, length
        completeFooterSlice = buffer.slice(buffer.length() - completeFooterSize, completeFooterSize);
    }
    // read metadata
    Slice metadataSlice = completeFooterSlice.slice(0, metadataSize);
    try (InputStream metadataInputStream = new OrcInputStream(OrcChunkLoader.create(orcDataSource.getId(), metadataSlice, orcFileTail.decompressor, newSimpleAggregatedMemoryContext()))) {
        orcFileTail.metadata = metadataReader.readMetadata(hiveWriterVersion, metadataInputStream);
    }
    // read footer
    Slice footerSlice = completeFooterSlice.slice(metadataSize, footerSize);
    try (InputStream footerInputStream = new OrcInputStream(OrcChunkLoader.create(orcDataSource.getId(), footerSlice, orcFileTail.decompressor, newSimpleAggregatedMemoryContext()))) {
        orcFileTail.footer = metadataReader.readFooter(hiveWriterVersion, footerInputStream);
    }
    if (orcFileTail.footer.getTypes().size() == 0) {
        throw new OrcCorruptionException(orcDataSource.getId(), "File has no columns");
    }
    validateWrite(validation -> validation.getColumnNames().equals(orcFileTail.footer.getTypes().get(new OrcColumnId(0)).getFieldNames()), writeValidation, orcDataSource, "Unexpected column names");
    validateWrite(validation -> validation.getRowGroupMaxRowCount() == orcFileTail.footer.getRowsInRowGroup(), writeValidation, orcDataSource, "Unexpected rows in group");
    if (writeValidation.isPresent()) {
        writeValidation.get().validateMetadata(orcDataSource.getId(), orcFileTail.footer.getUserMetadata());
        writeValidation.get().validateFileStatistics(orcDataSource.getId(), orcFileTail.footer.getFileStats());
        writeValidation.get().validateStripeStatistics(orcDataSource.getId(), orcFileTail.footer.getStripes(), orcFileTail.metadata.getStripeStatsList());
    }
    return orcFileTail;
}
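For context, a minimal sketch of how readFrom might be invoked against a local file, reusing the FileOrcDataSource constructor arguments that appear in the test further below. The file path, buffer size, and the getFooter() accessor on OrcFileTail are illustrative assumptions, not taken from this excerpt:

// Hypothetical caller: open a local ORC file and decode its tail (postscript, metadata, footer).
// Path and buffer size are placeholder values; no write validation is requested.
File orcFile = new File("/tmp/example.orc");
DataSize dataSize = new DataSize(1, MEGABYTE);
OrcDataSource dataSource = new FileOrcDataSource(orcFile, dataSize, dataSize, dataSize, true, orcFile.lastModified());
OrcFileTail fileTail = OrcFileTail.readFrom(dataSource, Optional.empty());
// Assuming OrcFileTail exposes the decoded footer, stripe information is then available:
System.out.println("stripes: " + fileTail.getFooter().getStripes().size());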
Use of io.prestosql.orc.metadata.OrcMetadataReader in project hetu-core by openlookeng.
The class TestOrcBloomFilters, method testOrcHiveBloomFilterSerde:
@Test
public void testOrcHiveBloomFilterSerde()
        throws Exception
{
    HashableBloomFilter bloomFilterWrite = new HashableBloomFilter(1000L, 0.05);
    bloomFilterWrite.add(TEST_STRING);
    assertTrue(bloomFilterWrite.test(TEST_STRING));
    assertTrue(bloomFilterWrite.test(wrappedBuffer(TEST_STRING)));
    OrcProto.BloomFilter.Builder bloomFilterBuilder = OrcProto.BloomFilter.newBuilder();
    bloomFilterBuilder.addAllBitset(Longs.asList(bloomFilterWrite.getBitSet()));
    bloomFilterBuilder.setNumHashFunctions(bloomFilterWrite.getNumHashFunctions());
    OrcProto.BloomFilter bloomFilter = bloomFilterBuilder.build();
    OrcProto.BloomFilterIndex bloomFilterIndex = OrcProto.BloomFilterIndex.getDefaultInstance();
    byte[] bytes = serializeBloomFilterToIndex(bloomFilter, bloomFilterIndex);
    // Read through method
    InputStream inputStream = new ByteArrayInputStream(bytes);
    OrcMetadataReader metadataReader = new OrcMetadataReader();
    List<HashableBloomFilter> bloomFilters = metadataReader.readBloomFilterIndexes(inputStream);
    assertEquals(bloomFilters.size(), 1);
    assertTrue(bloomFilters.get(0).test(TEST_STRING));
    assertTrue(bloomFilters.get(0).test(wrappedBuffer(TEST_STRING)));
    assertFalse(bloomFilters.get(0).test(TEST_STRING_NOT_WRITTEN));
    assertFalse(bloomFilters.get(0).test(wrappedBuffer(TEST_STRING_NOT_WRITTEN)));
    assertEquals(bloomFilterWrite.getNumBits(), bloomFilters.get(0).getNumBits());
    assertEquals(bloomFilterWrite.getNumHashFunctions(), bloomFilters.get(0).getNumHashFunctions());
    // Validate bit set
    assertTrue(Arrays.equals(bloomFilters.get(0).getBitSet(), bloomFilterWrite.getBitSet()));
    // Read directly: allows better inspection of the bit sets (helped to fix a lot of bugs)
    CodedInputStream input = CodedInputStream.newInstance(bytes);
    OrcProto.BloomFilterIndex deserializedBloomFilterIndex = OrcProto.BloomFilterIndex.parseFrom(input);
    List<OrcProto.BloomFilter> bloomFilterList = deserializedBloomFilterIndex.getBloomFilterList();
    assertEquals(bloomFilterList.size(), 1);
    OrcProto.BloomFilter bloomFilterRead = bloomFilterList.get(0);
    // Validate contents of ORC bloom filter bit set
    assertTrue(Arrays.equals(Longs.toArray(bloomFilterRead.getBitsetList()), bloomFilterWrite.getBitSet()));
    // hash functions
    assertEquals(bloomFilterWrite.getNumHashFunctions(), bloomFilterRead.getNumHashFunctions());
    // bit size
    assertEquals(bloomFilterWrite.getBitSet().length, bloomFilterRead.getBitsetCount());
}
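The serializeBloomFilterToIndex helper referenced above is not shown in this excerpt. A plausible sketch, assuming it simply wraps the single BloomFilter message in a BloomFilterIndex and returns the serialized protobuf bytes that readBloomFilterIndexes consumes; the real helper may differ:

// Hypothetical reconstruction of the test helper (not taken from this excerpt):
// add the filter to the index builder, then serialize the protobuf message to a byte array.
private static byte[] serializeBloomFilterToIndex(OrcProto.BloomFilter bloomFilter, OrcProto.BloomFilterIndex bloomFilterIndex)
        throws IOException
{
    OrcProto.BloomFilterIndex index = bloomFilterIndex.toBuilder()
            .addBloomFilter(bloomFilter)
            .build();
    ByteArrayOutputStream output = new ByteArrayOutputStream();
    index.writeTo(output);
    return output.toByteArray();
}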
Use of io.prestosql.orc.metadata.OrcMetadataReader in project hetu-core by openlookeng.
The class TestOrcWriter, method testWriteOutputStreamsInOrder:
@Test
public void testWriteOutputStreamsInOrder()
        throws IOException
{
    for (OrcWriteValidationMode validationMode : OrcWriteValidationMode.values()) {
        TempFile tempFile = new TempFile();
        OrcWriter writer = new OrcWriter(
                new OutputStreamOrcDataSink(new FileOutputStream(tempFile.getFile())),
                ImmutableList.of("test1", "test2", "test3", "test4", "test5"),
                ImmutableList.of(VARCHAR, VARCHAR, VARCHAR, VARCHAR, VARCHAR),
                NONE,
                new OrcWriterOptions()
                        .withStripeMinSize(new DataSize(0, MEGABYTE))
                        .withStripeMaxSize(new DataSize(32, MEGABYTE))
                        .withStripeMaxRowCount(ORC_STRIPE_SIZE)
                        .withRowGroupMaxRowCount(ORC_ROW_GROUP_SIZE)
                        .withDictionaryMaxMemory(new DataSize(32, MEGABYTE)),
                false,
                ImmutableMap.of(),
                true,
                validationMode,
                new OrcWriterStats(),
                Optional.empty(),
                Optional.empty());
        // write down some data with unsorted streams
        String[] data = new String[] {"a", "bbbbb", "ccc", "dd", "eeee"};
        Block[] blocks = new Block[data.length];
        int entries = 65536;
        BlockBuilder blockBuilder = VARCHAR.createBlockBuilder(null, entries);
        for (int i = 0; i < data.length; i++) {
            byte[] bytes = data[i].getBytes();
            for (int j = 0; j < entries; j++) {
                // force to write different data
                bytes[0] = (byte) ((bytes[0] + 1) % 128);
                blockBuilder.writeBytes(Slices.wrappedBuffer(bytes, 0, bytes.length), 0, bytes.length);
                blockBuilder.closeEntry();
            }
            blocks[i] = blockBuilder.build();
            blockBuilder = blockBuilder.newBlockBuilderLike(null);
        }
        writer.write(new Page(blocks));
        writer.close();
        // read the footer and verify the streams are ordered by size
        DataSize dataSize = new DataSize(1, MEGABYTE);
        OrcDataSource orcDataSource = new FileOrcDataSource(tempFile.getFile(), dataSize, dataSize, dataSize, true, tempFile.getFile().lastModified());
        Footer footer = new OrcReader(orcDataSource, dataSize, dataSize, dataSize).getFooter();
        for (StripeInformation stripe : footer.getStripes()) {
            // read the footer
            Slice tailBuffer = orcDataSource.readFully(stripe.getOffset() + stripe.getIndexLength() + stripe.getDataLength(), toIntExact(stripe.getFooterLength()));
            try (InputStream inputStream = new OrcInputStream(OrcChunkLoader.create(orcDataSource.getId(), tailBuffer, Optional.empty(), newSimpleAggregatedMemoryContext()))) {
                StripeFooter stripeFooter = new OrcMetadataReader().readStripeFooter(footer.getTypes(), inputStream, ZoneId.of("UTC"));
                int size = 0;
                boolean dataStreamStarted = false;
                for (Stream stream : stripeFooter.getStreams()) {
                    if (isIndexStream(stream)) {
                        assertFalse(dataStreamStarted);
                        continue;
                    }
                    dataStreamStarted = true;
                    // verify sizes in order
                    assertGreaterThanOrEqual(stream.getLength(), size);
                    size = stream.getLength();
                }
            }
        }
    }
}
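The isIndexStream check used above comes from the reader's stripe handling and is not shown here. A minimal sketch of the idea, assuming the usual ORC index stream kinds; the project's actual helper may cover additional kinds:

// Hypothetical sketch (not taken from this excerpt): index streams such as the row index,
// bloom filters, and dictionary counts are written before data streams in a stripe, which
// is why the test asserts that none of them appears once data streams have started.
private static boolean isIndexStream(Stream stream)
{
    Stream.StreamKind kind = stream.getStreamKind();
    return kind == Stream.StreamKind.ROW_INDEX
            || kind == Stream.StreamKind.DICTIONARY_COUNT
            || kind == Stream.StreamKind.BLOOM_FILTER
            || kind == Stream.StreamKind.BLOOM_FILTER_UTF8;
}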