Use of com.facebook.presto.orc.metadata.CompressionKind in project presto by prestodb.
The class OrcTester, method assertRoundTrip.
private void assertRoundTrip(List<Type> writeTypes, List<Type> readTypes, List<List<?>> writeValues, List<List<?>> readValues, boolean verifyWithHiveReader, List<OrcReaderSettings> settings)
        throws Exception
{
    assertEquals(writeTypes.size(), readTypes.size());
    assertEquals(writeTypes.size(), writeValues.size());
    assertEquals(writeTypes.size(), readValues.size());

    OrcWriterStats stats = new OrcWriterStats();
    for (Format format : formats) {
        if (!readTypes.stream().allMatch(readType -> format.supportsType(readType))) {
            return;
        }
        OrcEncoding orcEncoding = format.getOrcEncoding();
        for (CompressionKind compression : compressions) {
            boolean hiveSupported = (compression != LZ4) && (compression != ZSTD);

            // write Hive, read Presto
            if (hiveSupported) {
                try (TempFile tempFile = new TempFile()) {
                    writeOrcColumnsHive(tempFile.getFile(), format, compression, writeTypes, writeValues);
                    assertFileContentsPresto(readTypes, tempFile, readValues, false, false, orcEncoding, format, true, useSelectiveOrcReader, settings, ImmutableMap.of());
                }
            }

            // write Presto, read Hive and Presto
            try (TempFile tempFile = new TempFile()) {
                writeOrcColumnsPresto(tempFile.getFile(), format, compression, Optional.empty(), writeTypes, writeValues, stats);
                if (verifyWithHiveReader && hiveSupported) {
                    assertFileContentsHive(readTypes, tempFile, format, readValues);
                }
                assertFileContentsPresto(readTypes, tempFile, readValues, false, false, orcEncoding, format, false, useSelectiveOrcReader, settings, ImmutableMap.of());
                if (skipBatchTestsEnabled) {
                    assertFileContentsPresto(readTypes, tempFile, readValues, true, false, orcEncoding, format, false, useSelectiveOrcReader, settings, ImmutableMap.of());
                }
                if (skipStripeTestsEnabled) {
                    assertFileContentsPresto(readTypes, tempFile, readValues, false, true, orcEncoding, format, false, useSelectiveOrcReader, settings, ImmutableMap.of());
                }
            }

            // write Presto, read Presto (with DWRF encryption)
            if (dwrfEncryptionEnabled && format == DWRF) {
                try (TempFile tempFile = new TempFile()) {
                    DwrfWriterEncryption dwrfWriterEncryption = generateWriterEncryption();
                    writeOrcColumnsPresto(tempFile.getFile(), format, compression, Optional.of(dwrfWriterEncryption), writeTypes, writeValues, stats);

                    ImmutableMap.Builder<Integer, Slice> intermediateKeysBuilder = ImmutableMap.builder();
                    for (int i = 0; i < dwrfWriterEncryption.getWriterEncryptionGroups().size(); i++) {
                        for (Integer node : dwrfWriterEncryption.getWriterEncryptionGroups().get(i).getNodes()) {
                            intermediateKeysBuilder.put(node, dwrfWriterEncryption.getWriterEncryptionGroups().get(i).getIntermediateKeyMetadata());
                        }
                    }
                    Map<Integer, Slice> intermediateKeysMap = intermediateKeysBuilder.build();

                    assertFileContentsPresto(readTypes, tempFile, readValues, false, false, orcEncoding, format, false, useSelectiveOrcReader, settings, intermediateKeysMap);
                    if (skipBatchTestsEnabled) {
                        assertFileContentsPresto(readTypes, tempFile, readValues, true, false, orcEncoding, format, false, useSelectiveOrcReader, settings, intermediateKeysMap);
                    }
                    if (skipStripeTestsEnabled) {
                        assertFileContentsPresto(readTypes, tempFile, readValues, false, true, orcEncoding, format, false, useSelectiveOrcReader, settings, intermediateKeysMap);
                    }
                }
            }
        }
    }
    assertEquals(stats.getWriterSizeInBytes(), 0);
}
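The hiveSupported flag above is what keeps the LZ4 and ZSTD round trips on the Presto-only path, since the Hive ORC reader/writer used by the tester does not handle those codecs. A minimal standalone sketch of that gate (iterating all enum constants here is an assumption; the test itself only relies on LZ4 and ZSTD being excluded):

import com.facebook.presto.orc.metadata.CompressionKind;

import static com.facebook.presto.orc.metadata.CompressionKind.LZ4;
import static com.facebook.presto.orc.metadata.CompressionKind.ZSTD;

public class HiveCompressionSupportSketch
{
    // Mirrors the gate in assertRoundTrip: Hive round trips are skipped for LZ4 and ZSTD.
    public static boolean isHiveSupported(CompressionKind compression)
    {
        return compression != LZ4 && compression != ZSTD;
    }

    public static void main(String[] args)
    {
        for (CompressionKind kind : CompressionKind.values()) {
            System.out.println(kind + " hiveSupported=" + isHiveSupported(kind));
        }
    }
}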
Use of com.facebook.presto.orc.metadata.CompressionKind in project presto by prestodb.
The class TestOrcSelectiveStreamReaders, method testEmptyStrings.
/**
 * This test exercises SliceDirectSelectiveStreamReader for the case where all elements to read are empty strings. The output Block should be a valid VariableWidthBlock with an
 * empty Slice. It simulates a problem seen in production. The state of SliceDirectSelectiveStreamReader needed to reproduce the problem is:
 * - dataStream: null
 * - presentStream: null
 * - lengthStream: not null
 * - filter: null
 * - outputRequired: true
 * - offsets array: non-zero
 * The test issues two reads: the first reads a non-empty string and populates non-zero offsets; the second reads the empty string with the above conditions met.
 */
@Test
public void testEmptyStrings()
        throws Exception
{
    Type type = VARCHAR;
    List<Type> types = ImmutableList.of(type);
    List<List<?>> values = ImmutableList.of(ImmutableList.of("a", ""));

    for (OrcTester.Format format : formats) {
        if (!types.stream().allMatch(readType -> format.supportsType(readType))) {
            return;
        }
        for (CompressionKind compression : compressions) {
            TempFile tempFile = new TempFile();
            writeOrcColumnsPresto(tempFile.getFile(), format, compression, Optional.empty(), types, values, new OrcWriterStats());

            OrcPredicate orcPredicate = createOrcPredicate(types, values, DWRF, false);
            Map<Integer, Type> includedColumns = IntStream.range(0, types.size()).boxed().collect(toImmutableMap(Function.identity(), types::get));
            List<Integer> outputColumns = IntStream.range(0, types.size()).boxed().collect(toImmutableList());
            OrcAggregatedMemoryContext systemMemoryUsage = new TestingHiveOrcAggregatedMemoryContext();

            try (OrcSelectiveRecordReader recordReader = createCustomOrcSelectiveRecordReader(
                    tempFile.getFile(), format.getOrcEncoding(), orcPredicate, types, 1,
                    ImmutableMap.of(), ImmutableList.of(), ImmutableMap.of(),
                    OrcTester.OrcReaderSettings.builder().build().getRequiredSubfields(),
                    ImmutableMap.of(), ImmutableMap.of(), includedColumns, outputColumns, false, systemMemoryUsage, false)) {
                assertEquals(recordReader.getReaderPosition(), 0);
                assertEquals(recordReader.getFilePosition(), 0);

                SelectiveStreamReader streamReader = recordReader.getStreamReaders()[0];

                // Read the first non-empty element. Do not call streamReader.getBlock() to preserve the offsets array in SliceDirectSelectiveStreamReader.
                int batchSize = min(recordReader.prepareNextBatch(), 1);
                int[] positions = IntStream.range(0, batchSize).toArray();
                streamReader.read(0, positions, batchSize);
                recordReader.batchRead(batchSize);

                // Read the second element: an empty string. Set the dataStream in SliceDirectSelectiveStreamReader to null to simulate the conditions causing the problem.
                ((SliceSelectiveStreamReader) streamReader).resetDataStream();
                batchSize = min(recordReader.prepareNextBatch(), 1);
                positions = IntStream.range(0, batchSize).toArray();
                streamReader.read(0, positions, batchSize);
                recordReader.batchRead(batchSize);

                Block block = streamReader.getBlock(positions, batchSize);
                List<?> expectedValues = ImmutableList.of("");
                assertBlockEquals(type, block, expectedValues, 0);
                assertEquals(recordReader.getReaderPosition(), 1);
                assertEquals(recordReader.getFilePosition(), 1);
            }
        }
    }
}
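The offsets condition in the Javadoc is easiest to see with a simplified, hypothetical model of a variable-width column (plain Java arrays, not Presto's VariableWidthBlock): an empty string contributes no data bytes, but its end offset must still equal the previous offset, so a reader that ignored the existing offsets when dataStream is null would produce a corrupt block.

import java.nio.charset.StandardCharsets;

public class VariableWidthOffsetsSketch
{
    public static void main(String[] args)
    {
        // State after reading the first value "a": one data byte, offsets populated.
        byte[] data = "a".getBytes(StandardCharsets.UTF_8);
        int[] offsets = {0, 1, 1}; // value i spans data[offsets[i] .. offsets[i + 1])

        // The second value is the empty string: its start and end offsets are both 1,
        // so its length is 0 even though the shared data buffer is non-empty.
        int start = offsets[1];
        int end = offsets[2];
        System.out.println("second value = '" + new String(data, start, end - start, StandardCharsets.UTF_8) + "'");
    }
}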
Use of com.facebook.presto.orc.metadata.CompressionKind in project presto by prestodb.
The class OrcFileWriterFactory, method getCompression.
private static CompressionKind getCompression(Properties schema, JobConf configuration, OrcEncoding orcEncoding)
{
    String compressionName = OrcConf.COMPRESS.getString(schema, configuration);
    if (compressionName == null) {
        return CompressionKind.ZLIB;
    }
    CompressionKind compression;
    try {
        compression = CompressionKind.valueOf(compressionName.toUpperCase(ENGLISH));
    }
    catch (IllegalArgumentException e) {
        throw new PrestoException(HIVE_UNSUPPORTED_FORMAT, "Unknown " + orcEncoding + " compression type " + compressionName);
    }
    return compression;
}
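The lookup above is a case-insensitive enum resolution with a ZLIB default. A minimal sketch of the same idea, assuming the schema key that OrcConf.COMPRESS resolves to is "orc.compress" (that key name is an assumption here, as is SNAPPY being one of the available constants):

import com.facebook.presto.orc.metadata.CompressionKind;

import java.util.Locale;
import java.util.Properties;

public class CompressionLookupSketch
{
    // Hypothetical helper mirroring getCompression: default to ZLIB when the table
    // declares no codec, otherwise resolve the name case-insensitively. Unlike the
    // real method, an unknown name is not wrapped in a PrestoException here.
    static CompressionKind fromSchema(Properties schema)
    {
        String name = schema.getProperty("orc.compress"); // assumed key name
        if (name == null) {
            return CompressionKind.ZLIB;
        }
        return CompressionKind.valueOf(name.toUpperCase(Locale.ENGLISH));
    }

    public static void main(String[] args)
    {
        Properties schema = new Properties();
        schema.setProperty("orc.compress", "snappy");
        System.out.println(fromSchema(schema)); // SNAPPY
    }
}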
Use of com.facebook.presto.orc.metadata.CompressionKind in project presto by prestodb.
The class OrcFileWriterFactory, method createFileWriter.
@Override
public Optional<HiveFileWriter> createFileWriter(Path path, List<String> inputColumnNames, StorageFormat storageFormat, Properties schema, JobConf configuration, ConnectorSession session, Optional<EncryptionInformation> encryptionInformation)
{
    if (!HiveSessionProperties.isOrcOptimizedWriterEnabled(session)) {
        return Optional.empty();
    }

    OrcEncoding orcEncoding;
    if (OrcOutputFormat.class.getName().equals(storageFormat.getOutputFormat())) {
        orcEncoding = ORC;
    }
    else if (com.facebook.hive.orc.OrcOutputFormat.class.getName().equals(storageFormat.getOutputFormat())) {
        orcEncoding = DWRF;
    }
    else {
        return Optional.empty();
    }

    CompressionKind compression = getCompression(schema, configuration, orcEncoding);

    // existing tables and partitions may have columns in a different order than the writer is providing, so build
    // an index to rearrange columns in the proper order
    List<String> fileColumnNames = Splitter.on(',').trimResults().omitEmptyStrings().splitToList(schema.getProperty(META_TABLE_COLUMNS, ""));
    List<Type> fileColumnTypes = toHiveTypes(schema.getProperty(META_TABLE_COLUMN_TYPES, "")).stream().map(hiveType -> hiveType.getType(typeManager)).collect(toList());
    int[] fileInputColumnIndexes = fileColumnNames.stream().mapToInt(inputColumnNames::indexOf).toArray();

    try {
        FileSystem fileSystem = hdfsEnvironment.getFileSystem(session.getUser(), path, configuration);
        DataSink dataSink = createDataSink(session, fileSystem, path);

        Optional<Supplier<OrcDataSource>> validationInputFactory = Optional.empty();
        if (HiveSessionProperties.isOrcOptimizedWriterValidate(session)) {
            validationInputFactory = Optional.of(() -> {
                try {
                    return new HdfsOrcDataSource(new OrcDataSourceId(path.toString()), fileSystem.getFileStatus(path).getLen(), getOrcMaxMergeDistance(session), getOrcMaxBufferSize(session), getOrcStreamBufferSize(session), false, fileSystem.open(path), readStats);
                }
                catch (IOException e) {
                    throw new PrestoException(HIVE_WRITE_VALIDATION_FAILED, e);
                }
            });
        }

        Callable<Void> rollbackAction = () -> {
            fileSystem.delete(path, false);
            return null;
        };

        Optional<DwrfWriterEncryption> dwrfWriterEncryption = createDwrfEncryption(encryptionInformation, fileColumnNames, fileColumnTypes);

        return Optional.of(new OrcFileWriter(
                dataSink, rollbackAction, orcEncoding, fileColumnNames, fileColumnTypes, compression,
                orcFileWriterConfig.toOrcWriterOptionsBuilder()
                        .withFlushPolicy(DefaultOrcWriterFlushPolicy.builder()
                                .withStripeMinSize(getOrcOptimizedWriterMinStripeSize(session))
                                .withStripeMaxSize(getOrcOptimizedWriterMaxStripeSize(session))
                                .withStripeMaxRowCount(getOrcOptimizedWriterMaxStripeRows(session))
                                .build())
                        .withDictionaryMaxMemory(getOrcOptimizedWriterMaxDictionaryMemory(session))
                        .withMaxStringStatisticsLimit(getOrcStringStatisticsLimit(session))
                        .withIgnoreDictionaryRowGroupSizes(isExecutionBasedMemoryAccountingEnabled(session))
                        .withDwrfStripeCacheEnabled(isDwrfWriterStripeCacheEnabled(session))
                        .withDwrfStripeCacheMaxSize(getDwrfWriterStripeCacheeMaxSize(session))
                        .build(),
                fileInputColumnIndexes,
                ImmutableMap.<String, String>builder()
                        .put(HiveMetadata.PRESTO_VERSION_NAME, nodeVersion.toString())
                        .put(MetastoreUtil.PRESTO_QUERY_ID_NAME, session.getQueryId())
                        .build(),
                hiveStorageTimeZone, validationInputFactory, getOrcOptimizedWriterValidateMode(session), stats, dwrfEncryptionProvider, dwrfWriterEncryption));
    }
    catch (IOException e) {
        throw new PrestoException(HIVE_WRITER_OPEN_ERROR, "Error creating " + orcEncoding + " file. " + e.getMessage(), e);
    }
}
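The fileInputColumnIndexes array built above is what lets the writer accept pages whose columns arrive in a different order than the table schema. A small self-contained sketch of that mapping with made-up column names (the null-filling behavior for a -1 index is an assumption about how the writer treats missing columns):

import java.util.Arrays;
import java.util.List;

public class ColumnIndexSketch
{
    public static void main(String[] args)
    {
        // Table/partition column order vs. the order the incoming pages provide.
        List<String> fileColumnNames = List.of("ds", "user_id", "event");
        List<String> inputColumnNames = List.of("user_id", "event", "ds");

        // Same mapping as fileInputColumnIndexes: for each file column, find its position
        // in the input; indexOf yields -1 for a column the input does not provide.
        int[] fileInputColumnIndexes = fileColumnNames.stream()
                .mapToInt(inputColumnNames::indexOf)
                .toArray();

        System.out.println(Arrays.toString(fileInputColumnIndexes)); // [2, 0, 1]
    }
}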
Use of com.facebook.presto.orc.metadata.CompressionKind in project presto by prestodb.
The class StorageOrcFileTailSource, method getOrcFileTail.
@Override
public OrcFileTail getOrcFileTail(OrcDataSource orcDataSource, MetadataReader metadataReader, Optional<OrcWriteValidation> writeValidation, boolean cacheable)
        throws IOException
{
    long size = orcDataSource.getSize();
    if (size <= MAGIC.length()) {
        throw new OrcCorruptionException(orcDataSource.getId(), "Invalid file size %s", size);
    }

    // Read the tail of the file
    byte[] buffer = new byte[toIntExact(min(size, expectedFooterSizeInBytes))];
    orcDataSource.readFully(size - buffer.length, buffer);

    // get length of PostScript - last byte of the file
    int postScriptSize = buffer[buffer.length - SIZE_OF_BYTE] & 0xff;
    if (postScriptSize >= buffer.length) {
        throw new OrcCorruptionException(orcDataSource.getId(), "Invalid postscript length %s", postScriptSize);
    }

    // decode the post script
    PostScript postScript;
    try {
        postScript = metadataReader.readPostScript(buffer, buffer.length - SIZE_OF_BYTE - postScriptSize, postScriptSize);
    }
    catch (OrcCorruptionException e) {
        // check if this is an ORC file and not an RCFile or something else
        if (!isValidHeaderMagic(orcDataSource)) {
            throw new OrcCorruptionException(orcDataSource.getId(), "Not an ORC file");
        }
        throw e;
    }

    // verify this is a supported version
    checkOrcVersion(orcDataSource, postScript.getVersion());
    validateWrite(writeValidation, orcDataSource, validation -> validation.getVersion().equals(postScript.getVersion()), "Unexpected version");

    int bufferSize = toIntExact(postScript.getCompressionBlockSize());

    // check compression codec is supported
    CompressionKind compressionKind = postScript.getCompression();
    validateWrite(writeValidation, orcDataSource, validation -> validation.getCompression() == compressionKind, "Unexpected compression");

    PostScript.HiveWriterVersion hiveWriterVersion = postScript.getHiveWriterVersion();
    int footerSize = toIntExact(postScript.getFooterLength());
    int metadataSize = toIntExact(postScript.getMetadataLength());
    if (footerSize < 0) {
        throw new OrcCorruptionException(orcDataSource.getId(), "Invalid footer length %s", footerSize);
    }
    if (metadataSize < 0) {
        throw new OrcCorruptionException(orcDataSource.getId(), "Invalid metadata length %s", metadataSize);
    }

    // read DWRF stripe cache only if this feature is enabled and it has meaningful data
    boolean readDwrfStripeCache = dwrfStripeCacheEnabled && postScript.getDwrfStripeCacheLength().isPresent() && postScript.getDwrfStripeCacheMode().isPresent() && postScript.getDwrfStripeCacheMode().get() != DwrfStripeCacheMode.NONE;
    int dwrfStripeCacheSize = 0;
    if (readDwrfStripeCache) {
        dwrfStripeCacheSize = postScript.getDwrfStripeCacheLength().getAsInt();
        checkSizes(orcDataSource, metadataSize, dwrfStripeCacheSize);
    }

    // check if extra bytes need to be read
    Slice completeFooterSlice;
    int completeFooterSize = dwrfStripeCacheSize + metadataSize + footerSize + postScriptSize + SIZE_OF_BYTE;
    if (completeFooterSize > buffer.length) {
        // allocate a new buffer large enough for the complete footer
        byte[] newBuffer = new byte[completeFooterSize];
        completeFooterSlice = Slices.wrappedBuffer(newBuffer);

        // initial read was not large enough, so read missing section
        orcDataSource.readFully(size - completeFooterSize, newBuffer, 0, completeFooterSize - buffer.length);

        // copy already read bytes into the new buffer
        completeFooterSlice.setBytes(completeFooterSize - buffer.length, buffer);
    }
    else {
        // footer is already in the bytes in buffer, just adjust position, length
        completeFooterSlice = Slices.wrappedBuffer(buffer, buffer.length - completeFooterSize, completeFooterSize);
    }

    // metadataSize is set only for ORC files, dwrfStripeCacheSize is set only for DWRF files
    // it should be safe to sum them up to find footer offset
    // TAIL: [ ORC_METADATA{0,1} | DWRF_STRIPE_CACHE {0,1} ] + FOOTER + POST_SCRIPT + POST_SCRIPT_SIZE (1 byte)
    int footerSliceOffset = metadataSize + dwrfStripeCacheSize;
    Slice footerSlice = completeFooterSlice.slice(footerSliceOffset, footerSize);
    Slice metadataSlice = completeFooterSlice.slice(0, metadataSize);

    // set DwrfStripeCacheData only if the stripe cache feature is enabled and the file has the stripe cache
    Optional<DwrfStripeCacheData> dwrfStripeCacheData = Optional.empty();
    if (readDwrfStripeCache) {
        Slice dwrfStripeCacheSlice = completeFooterSlice.slice(0, dwrfStripeCacheSize);
        DwrfStripeCacheMode stripeCacheMode = postScript.getDwrfStripeCacheMode().get();
        dwrfStripeCacheData = Optional.of(new DwrfStripeCacheData(dwrfStripeCacheSlice, dwrfStripeCacheSize, stripeCacheMode));
    }

    return new OrcFileTail(hiveWriterVersion, bufferSize, compressionKind, footerSlice, footerSize, metadataSlice, metadataSize, dwrfStripeCacheData);
}
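The offset arithmetic in this method follows directly from the TAIL layout comment. A tiny standalone sketch of the same math with made-up sizes (values are illustrative only):

public class OrcTailLayoutSketch
{
    public static void main(String[] args)
    {
        // TAIL: [ ORC_METADATA{0,1} | DWRF_STRIPE_CACHE {0,1} ] + FOOTER + POST_SCRIPT + POST_SCRIPT_SIZE (1 byte)
        int metadataSize = 300;       // present for ORC files, 0 for DWRF
        int dwrfStripeCacheSize = 0;  // present for DWRF files, 0 for ORC
        int footerSize = 1200;
        int postScriptSize = 25;

        // Same formulas as getOrcFileTail: total tail bytes and the footer's offset
        // within the completeFooterSlice.
        int completeFooterSize = dwrfStripeCacheSize + metadataSize + footerSize + postScriptSize + 1;
        int footerSliceOffset = metadataSize + dwrfStripeCacheSize;

        System.out.println("complete footer size = " + completeFooterSize);               // 1526
        System.out.println("footer offset        = " + footerSliceOffset);                // 300
        System.out.println("postscript offset    = " + (footerSliceOffset + footerSize)); // 1500
    }
}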