Use of io.prestosql.orc.metadata.CompressionKind in project hetu-core by openlookeng.
The class OrcFileTail, method readFrom.
public static OrcFileTail readFrom(OrcDataSource orcDataSource, Optional<OrcWriteValidation> writeValidation)
        throws IOException
{
    OrcFileTail orcFileTail = new OrcFileTail();
    //
    // Read the file tail:
    //
    // variable: Footer
    // variable: Metadata
    // variable: PostScript - contains length of footer and metadata
    // 1 byte: postScriptSize
    // figure out the size of the file using the option or filesystem
    long size = orcDataSource.getSize();
    if (size <= PostScript.MAGIC.length()) {
        throw new OrcCorruptionException(orcDataSource.getId(), "Invalid file size %s", size);
    }
    // Read the tail of the file
    int expectedBufferSize = toIntExact(min(size, EXPECTED_FOOTER_SIZE));
    Slice buffer = orcDataSource.readFully(size - expectedBufferSize, expectedBufferSize);
    // get length of PostScript - last byte of the file
    int postScriptSize = buffer.getUnsignedByte(buffer.length() - SIZE_OF_BYTE);
    if (postScriptSize >= buffer.length()) {
        throw new OrcCorruptionException(orcDataSource.getId(), "Invalid postscript length %s", postScriptSize);
    }
    MetadataReader metadataReader = new ExceptionWrappingMetadataReader(orcDataSource.getId(), new OrcMetadataReader());
    // decode the post script
    try {
        orcFileTail.postScript = metadataReader.readPostScript(buffer.slice(buffer.length() - SIZE_OF_BYTE - postScriptSize, postScriptSize).getInput());
    }
    catch (OrcCorruptionException e) {
        // check if this is an ORC file and not an RCFile or something else
        if (!isValidHeaderMagic(orcDataSource)) {
            throw new OrcCorruptionException(orcDataSource.getId(), "Not an ORC file");
        }
        throw e;
    }
    // verify this is a supported version
    checkOrcVersion(orcDataSource, orcFileTail.postScript.getVersion());
    validateWrite(validation -> validation.getVersion().equals(orcFileTail.postScript.getVersion()), writeValidation, orcDataSource, "Unexpected version");
    int bufferSize = toIntExact(orcFileTail.postScript.getCompressionBlockSize());
    // check compression codec is supported
    CompressionKind compressionKind = orcFileTail.postScript.getCompression();
    orcFileTail.decompressor = OrcDecompressor.createOrcDecompressor(orcDataSource.getId(), compressionKind, bufferSize);
    validateWrite(validation -> validation.getCompression() == compressionKind, writeValidation, orcDataSource, "Unexpected compression");
    PostScript.HiveWriterVersion hiveWriterVersion = orcFileTail.postScript.getHiveWriterVersion();
    int footerSize = toIntExact(orcFileTail.postScript.getFooterLength());
    int metadataSize = toIntExact(orcFileTail.postScript.getMetadataLength());
    // check if extra bytes need to be read
    Slice completeFooterSlice;
    int completeFooterSize = footerSize + metadataSize + postScriptSize + SIZE_OF_BYTE;
    if (completeFooterSize > buffer.length()) {
        // initial read was not large enough, so just read again with the correct size
        completeFooterSlice = orcDataSource.readFully(size - completeFooterSize, completeFooterSize);
    }
    else {
        // footer is already in the bytes in buffer, just adjust position, length
        completeFooterSlice = buffer.slice(buffer.length() - completeFooterSize, completeFooterSize);
    }
    // read metadata
    Slice metadataSlice = completeFooterSlice.slice(0, metadataSize);
    try (InputStream metadataInputStream = new OrcInputStream(OrcChunkLoader.create(orcDataSource.getId(), metadataSlice, orcFileTail.decompressor, newSimpleAggregatedMemoryContext()))) {
        orcFileTail.metadata = metadataReader.readMetadata(hiveWriterVersion, metadataInputStream);
    }
    // read footer
    Slice footerSlice = completeFooterSlice.slice(metadataSize, footerSize);
    try (InputStream footerInputStream = new OrcInputStream(OrcChunkLoader.create(orcDataSource.getId(), footerSlice, orcFileTail.decompressor, newSimpleAggregatedMemoryContext()))) {
        orcFileTail.footer = metadataReader.readFooter(hiveWriterVersion, footerInputStream);
    }
    if (orcFileTail.footer.getTypes().size() == 0) {
        throw new OrcCorruptionException(orcDataSource.getId(), "File has no columns");
    }
    validateWrite(validation -> validation.getColumnNames().equals(orcFileTail.footer.getTypes().get(new OrcColumnId(0)).getFieldNames()), writeValidation, orcDataSource, "Unexpected column names");
    validateWrite(validation -> validation.getRowGroupMaxRowCount() == orcFileTail.footer.getRowsInRowGroup(), writeValidation, orcDataSource, "Unexpected rows in group");
    if (writeValidation.isPresent()) {
        writeValidation.get().validateMetadata(orcDataSource.getId(), orcFileTail.footer.getUserMetadata());
        writeValidation.get().validateFileStatistics(orcDataSource.getId(), orcFileTail.footer.getFileStats());
        writeValidation.get().validateStripeStatistics(orcDataSource.getId(), orcFileTail.footer.getStripes(), orcFileTail.metadata.getStripeStatsList());
    }
    return orcFileTail;
}
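The method above relies on the ORC tail layout: footer, then metadata, then postscript, with the final byte of the file holding the unsigned postscript length. The following is a minimal standalone sketch that probes that layout by hand with plain java.io; the file name is a placeholder and this is for illustration only, not a substitute for going through OrcFileTail.readFrom or OrcReader.

// Illustration only: inspect the raw ORC tail layout described above.
// "someOrcFile.orc" is a placeholder path, not part of hetu-core.
import java.io.IOException;
import java.io.RandomAccessFile;

public final class OrcTailProbe
{
    public static void main(String[] args)
            throws IOException
    {
        try (RandomAccessFile file = new RandomAccessFile("someOrcFile.orc", "r")) {
            long size = file.length();
            // the last byte of the file is the unsigned PostScript length
            file.seek(size - 1);
            int postScriptSize = file.readUnsignedByte();
            // the PostScript itself sits immediately before that length byte
            long postScriptOffset = size - 1 - postScriptSize;
            System.out.printf("file size=%d, postscript size=%d, postscript offset=%d%n",
                    size, postScriptSize, postScriptOffset);
        }
    }
}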
Use of io.prestosql.orc.metadata.CompressionKind in project hetu-core by openlookeng.
The class OrcTester, method assertRoundTrip.
private void assertRoundTrip(Type writeType, Type readType, List<?> writeValues, List<?> readValues, List<Map<Integer, TupleDomainFilter>> filters)
        throws Exception
{
    OrcWriterStats stats = new OrcWriterStats();
    for (CompressionKind compression : compressions) {
        boolean hiveSupported = (compression != LZ4) && (compression != ZSTD);
        for (Format format : formats) {
            // write Hive, read Presto
            if (hiveSupported) {
                try (TempFile tempFile = new TempFile()) {
                    writeOrcColumnHive(tempFile.getFile(), format, compression, writeType, writeValues.iterator());
                    assertFileContentsPresto(readType, tempFile, readValues, false, false, useSelectiveOrcReader, filters);
                }
            }
        }
        // write Presto, read Hive and Presto
        try (TempFile tempFile = new TempFile()) {
            writeOrcColumnPresto(tempFile.getFile(), compression, writeType, writeValues.iterator(), stats);
            if (hiveSupported) {
                assertFileContentsHive(readType, tempFile, readValues);
            }
            assertFileContentsPresto(readType, tempFile, readValues, false, false, useSelectiveOrcReader, filters);
            if (skipBatchTestsEnabled) {
                assertFileContentsPresto(readType, tempFile, readValues, true, false, useSelectiveOrcReader, filters);
            }
            if (skipStripeTestsEnabled) {
                assertFileContentsPresto(readType, tempFile, readValues, false, true, useSelectiveOrcReader, filters);
            }
        }
    }
    assertEquals(stats.getWriterSizeInBytes(), 0);
}
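Note that LZ4 and ZSTD are exercised only on the Presto side of the round trip, since the Hive writer and reader used here do not support them. A minimal sketch of the same "skip codecs Hive cannot handle" check, pulled out as a standalone helper; the class name and the HIVE_UNSUPPORTED set are assumptions for illustration, not part of OrcTester.

import static io.prestosql.orc.metadata.CompressionKind.LZ4;
import static io.prestosql.orc.metadata.CompressionKind.ZSTD;

import com.google.common.collect.ImmutableSet;
import io.prestosql.orc.metadata.CompressionKind;

import java.util.Set;

public final class CompressionMatrix
{
    // assumed helper set mirroring the hiveSupported check above
    private static final Set<CompressionKind> HIVE_UNSUPPORTED = ImmutableSet.of(LZ4, ZSTD);

    private CompressionMatrix() {}

    public static boolean isHiveSupported(CompressionKind compression)
    {
        return !HIVE_UNSUPPORTED.contains(compression);
    }

    public static void main(String[] args)
    {
        // print which codecs would be round-tripped against Hive
        for (CompressionKind compression : CompressionKind.values()) {
            System.out.printf("%s -> hive round trip: %s%n", compression, isHiveSupported(compression));
        }
    }
}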
Use of io.prestosql.orc.metadata.CompressionKind in project hetu-core by openlookeng.
The class OrcFileWriterFactory, method getCompression.
private static CompressionKind getCompression(Properties schema, JobConf configuration)
{
    String compressionName = OrcConf.COMPRESS.getString(schema, configuration);
    if (compressionName == null) {
        return CompressionKind.ZLIB;
    }
    CompressionKind compression;
    try {
        compression = CompressionKind.valueOf(compressionName.toUpperCase(ENGLISH));
    }
    catch (IllegalArgumentException e) {
        throw new PrestoException(HiveErrorCode.HIVE_UNSUPPORTED_FORMAT, "Unknown ORC compression type " + compressionName);
    }
    return compression;
}
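The lookup defaults to ZLIB when the compression property is absent and maps any unrecognized name to HIVE_UNSUPPORTED_FORMAT. A minimal sketch of the same "parse an enum from a config string with a default" pattern, kept standalone so it does not depend on Hive or Presto classes; the Codec enum, DEFAULT constant, and class name are assumptions for illustration.

import static java.util.Locale.ENGLISH;

public final class CodecConfig
{
    // placeholder enum standing in for CompressionKind
    enum Codec { NONE, ZLIB, SNAPPY, LZ4, ZSTD }

    private static final Codec DEFAULT = Codec.ZLIB;

    private CodecConfig() {}

    static Codec parseCodec(String name)
    {
        if (name == null) {
            // missing property: fall back to the default codec
            return DEFAULT;
        }
        try {
            // use a fixed locale so "zlib" parses the same way regardless of the JVM default
            return Codec.valueOf(name.toUpperCase(ENGLISH));
        }
        catch (IllegalArgumentException e) {
            throw new IllegalArgumentException("Unknown ORC compression type " + name, e);
        }
    }

    public static void main(String[] args)
    {
        System.out.println(parseCodec(null));      // ZLIB
        System.out.println(parseCodec("snappy"));  // SNAPPY
    }
}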
Use of io.prestosql.orc.metadata.CompressionKind in project hetu-core by openlookeng.
The class OrcFileWriterFactory, method createFileWriter.
@Override
public Optional<HiveFileWriter> createFileWriter(Path path, List<String> inputColumnNames, StorageFormat storageFormat, Properties schema, JobConf configuration, ConnectorSession session, Optional<AcidOutputFormat.Options> acidOptions, Optional<HiveACIDWriteType> acidWriteType)
{
    if (!OrcOutputFormat.class.getName().equals(storageFormat.getOutputFormat())) {
        return Optional.empty();
    }
    CompressionKind compression = getCompression(schema, configuration);
    // existing tables and partitions may have columns in a different order than the writer is providing, so build
    // an index to rearrange columns in the proper order
    List<String> fileColumnNames = getColumnNames(schema);
    List<Type> fileColumnTypes = getColumnTypes(schema).stream().map(hiveType -> hiveType.getType(typeManager)).collect(toList());
    List<Type> dataFileColumnTypes = fileColumnTypes;
    int[] fileInputColumnIndexes = fileColumnNames.stream().mapToInt(inputColumnNames::indexOf).toArray();
    Optional<HiveFileWriter> deleteDeltaWriter = Optional.empty();
    if (AcidUtils.isTablePropertyTransactional(schema) && !AcidUtils.isInsertOnlyTable(schema)) {
        ImmutableList<String> orcFileColumnNames = ImmutableList.of(
                OrcPageSourceFactory.ACID_COLUMN_OPERATION,
                OrcPageSourceFactory.ACID_COLUMN_ORIGINAL_TRANSACTION,
                OrcPageSourceFactory.ACID_COLUMN_BUCKET,
                OrcPageSourceFactory.ACID_COLUMN_ROW_ID,
                OrcPageSourceFactory.ACID_COLUMN_CURRENT_TRANSACTION,
                OrcPageSourceFactory.ACID_COLUMN_ROW_STRUCT);
        ImmutableList.Builder<RowType.Field> fieldsBuilder = ImmutableList.builder();
        for (int i = 0; i < fileColumnNames.size(); i++) {
            fieldsBuilder.add(new RowType.Field(Optional.of(fileColumnNames.get(i)), fileColumnTypes.get(i)));
        }
        ImmutableList<Type> orcFileColumnTypes = ImmutableList.of(INTEGER, BIGINT, INTEGER, BIGINT, BIGINT, RowType.from(fieldsBuilder.build()));
        fileColumnNames = orcFileColumnNames;
        fileColumnTypes = orcFileColumnTypes;
        if (acidWriteType.isPresent() && acidWriteType.get() == HiveACIDWriteType.UPDATE) {
            AcidOutputFormat.Options deleteOptions = acidOptions.get().clone().writingDeleteDelta(true);
            Path deleteDeltaPath = AcidUtils.createFilename(path.getParent().getParent(), deleteOptions);
            deleteDeltaWriter = createFileWriter(deleteDeltaPath, inputColumnNames, storageFormat, schema, configuration, session, Optional.of(deleteOptions), Optional.of(HiveACIDWriteType.DELETE));
        }
    }
    try {
        FileSystem fileSystem = hdfsEnvironment.getFileSystem(session.getUser(), path, configuration);
        OrcDataSink orcDataSink = createOrcDataSink(session, fileSystem, path);
        Optional<Supplier<OrcDataSource>> validationInputFactory = Optional.empty();
        if (HiveSessionProperties.isOrcOptimizedWriterValidate(session)) {
            validationInputFactory = Optional.of(() -> {
                try {
                    FileStatus fileStatus = fileSystem.getFileStatus(path);
                    return new HdfsOrcDataSource(new OrcDataSourceId(path.toString()), fileStatus.getLen(), HiveSessionProperties.getOrcMaxMergeDistance(session), HiveSessionProperties.getOrcMaxBufferSize(session), HiveSessionProperties.getOrcStreamBufferSize(session), false, fileSystem.open(path), readStats, fileStatus.getModificationTime());
                }
                catch (IOException e) {
                    throw new PrestoException(HiveErrorCode.HIVE_WRITE_VALIDATION_FAILED, e);
                }
            });
        }
        Callable<Void> rollbackAction = () -> {
            fileSystem.delete(path, false);
            return null;
        };
        return Optional.of(new OrcFileWriter(
                orcDataSink,
                rollbackAction,
                fileColumnNames,
                fileColumnTypes,
                dataFileColumnTypes,
                compression,
                orcWriterOptions
                        .withStripeMinSize(HiveSessionProperties.getOrcOptimizedWriterMinStripeSize(session))
                        .withStripeMaxSize(HiveSessionProperties.getOrcOptimizedWriterMaxStripeSize(session))
                        .withStripeMaxRowCount(HiveSessionProperties.getOrcOptimizedWriterMaxStripeRows(session))
                        .withDictionaryMaxMemory(HiveSessionProperties.getOrcOptimizedWriterMaxDictionaryMemory(session))
                        .withMaxStringStatisticsLimit(HiveSessionProperties.getOrcStringStatisticsLimit(session)),
                writeLegacyVersion,
                fileInputColumnIndexes,
                ImmutableMap.<String, String>builder()
                        .put(HiveMetadata.PRESTO_VERSION_NAME, nodeVersion.toString())
                        .put(HiveMetadata.PRESTO_QUERY_ID_NAME, session.getQueryId())
                        .put("hive.acid.version", String.valueOf(AcidUtils.OrcAcidVersion.ORC_ACID_VERSION))
                        .build(),
                validationInputFactory,
                HiveSessionProperties.getOrcOptimizedWriterValidateMode(session),
                stats,
                acidOptions,
                acidWriteType,
                deleteDeltaWriter,
                path));
    }
    catch (IOException e) {
        throw new PrestoException(HiveErrorCode.HIVE_WRITER_OPEN_ERROR, "Error creating ORC file", e);
    }
}
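The rollbackAction above is a Callable<Void> that deletes the partially written ORC file if the writer is abandoned. A minimal sketch of the same cleanup-on-failure pattern, using java.nio.file instead of Hadoop's FileSystem so it runs standalone; the class name, method name, and the simulated failure are assumptions for illustration.

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.concurrent.Callable;

public final class RollbackExample
{
    private RollbackExample() {}

    // Mirrors the rollbackAction above, but against the local filesystem:
    // the callable deletes the partially written file when invoked.
    static Callable<Void> rollbackActionFor(Path target)
    {
        return () -> {
            Files.deleteIfExists(target);
            return null;
        };
    }

    public static void main(String[] args)
            throws Exception
    {
        Path target = Files.createTempFile("orc-writer-", ".orc");
        Callable<Void> rollback = rollbackActionFor(target);
        try {
            // ... write data to "target" here ...
            throw new IOException("simulated write failure");
        }
        catch (IOException e) {
            // on failure, invoke the rollback action to remove the incomplete file
            rollback.call();
        }
        System.out.println("exists after rollback: " + Files.exists(target));
    }
}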