
Example 1 with CompressionKind

Use of io.prestosql.orc.metadata.CompressionKind in project hetu-core by openlookeng.

In the class OrcFileTail, the method readFrom:

public static OrcFileTail readFrom(OrcDataSource orcDataSource, Optional<OrcWriteValidation> writeValidation) throws IOException {
    OrcFileTail orcFileTail = new OrcFileTail();
    // 
    // Read the file tail:
    // 
    // variable: Footer
    // variable: Metadata
    // variable: PostScript - contains length of footer and metadata
    // 1 byte: postScriptSize
    // figure out the size of the file using the option or filesystem
    long size = orcDataSource.getSize();
    if (size <= PostScript.MAGIC.length()) {
        throw new OrcCorruptionException(orcDataSource.getId(), "Invalid file size %s", size);
    }
    // Read the tail of the file
    int expectedBufferSize = toIntExact(min(size, EXPECTED_FOOTER_SIZE));
    Slice buffer = orcDataSource.readFully(size - expectedBufferSize, expectedBufferSize);
    // get length of PostScript - last byte of the file
    int postScriptSize = buffer.getUnsignedByte(buffer.length() - SIZE_OF_BYTE);
    if (postScriptSize >= buffer.length()) {
        throw new OrcCorruptionException(orcDataSource.getId(), "Invalid postscript length %s", postScriptSize);
    }
    MetadataReader metadataReader = new ExceptionWrappingMetadataReader(orcDataSource.getId(), new OrcMetadataReader());
    // decode the post script
    try {
        orcFileTail.postScript = metadataReader.readPostScript(buffer.slice(buffer.length() - SIZE_OF_BYTE - postScriptSize, postScriptSize).getInput());
    } catch (OrcCorruptionException e) {
        // check if this is an ORC file and not an RCFile or something else
        if (!isValidHeaderMagic(orcDataSource)) {
            throw new OrcCorruptionException(orcDataSource.getId(), "Not an ORC file");
        }
        throw e;
    }
    // verify this is a supported version
    checkOrcVersion(orcDataSource, orcFileTail.postScript.getVersion());
    validateWrite(validation -> validation.getVersion().equals(orcFileTail.postScript.getVersion()), writeValidation, orcDataSource, "Unexpected version");
    int bufferSize = toIntExact(orcFileTail.postScript.getCompressionBlockSize());
    // check compression codec is supported
    CompressionKind compressionKind = orcFileTail.postScript.getCompression();
    orcFileTail.decompressor = OrcDecompressor.createOrcDecompressor(orcDataSource.getId(), compressionKind, bufferSize);
    validateWrite(validation -> validation.getCompression() == compressionKind, writeValidation, orcDataSource, "Unexpected compression");
    PostScript.HiveWriterVersion hiveWriterVersion = orcFileTail.postScript.getHiveWriterVersion();
    int footerSize = toIntExact(orcFileTail.postScript.getFooterLength());
    int metadataSize = toIntExact(orcFileTail.postScript.getMetadataLength());
    // check if extra bytes need to be read
    Slice completeFooterSlice;
    int completeFooterSize = footerSize + metadataSize + postScriptSize + SIZE_OF_BYTE;
    if (completeFooterSize > buffer.length()) {
        // initial read was not large enough, so just read again with the correct size
        completeFooterSlice = orcDataSource.readFully(size - completeFooterSize, completeFooterSize);
    } else {
        // footer is already in the bytes in buffer, just adjust position, length
        completeFooterSlice = buffer.slice(buffer.length() - completeFooterSize, completeFooterSize);
    }
    // read metadata
    Slice metadataSlice = completeFooterSlice.slice(0, metadataSize);
    try (InputStream metadataInputStream = new OrcInputStream(OrcChunkLoader.create(orcDataSource.getId(), metadataSlice, orcFileTail.decompressor, newSimpleAggregatedMemoryContext()))) {
        orcFileTail.metadata = metadataReader.readMetadata(hiveWriterVersion, metadataInputStream);
    }
    // read footer
    Slice footerSlice = completeFooterSlice.slice(metadataSize, footerSize);
    try (InputStream footerInputStream = new OrcInputStream(OrcChunkLoader.create(orcDataSource.getId(), footerSlice, orcFileTail.decompressor, newSimpleAggregatedMemoryContext()))) {
        orcFileTail.footer = metadataReader.readFooter(hiveWriterVersion, footerInputStream);
    }
    if (orcFileTail.footer.getTypes().size() == 0) {
        throw new OrcCorruptionException(orcDataSource.getId(), "File has no columns");
    }
    validateWrite(validation -> validation.getColumnNames().equals(orcFileTail.footer.getTypes().get(new OrcColumnId(0)).getFieldNames()), writeValidation, orcDataSource, "Unexpected column names");
    validateWrite(validation -> validation.getRowGroupMaxRowCount() == orcFileTail.footer.getRowsInRowGroup(), writeValidation, orcDataSource, "Unexpected rows in group");
    if (writeValidation.isPresent()) {
        writeValidation.get().validateMetadata(orcDataSource.getId(), orcFileTail.footer.getUserMetadata());
        writeValidation.get().validateFileStatistics(orcDataSource.getId(), orcFileTail.footer.getFileStats());
        writeValidation.get().validateStripeStatistics(orcDataSource.getId(), orcFileTail.footer.getStripes(), orcFileTail.metadata.getStripeStatsList());
    }
    return orcFileTail;
}
Also used : OrcColumnId(io.prestosql.orc.metadata.OrcColumnId) CompressionKind(io.prestosql.orc.metadata.CompressionKind) OrcInputStream(io.prestosql.orc.stream.OrcInputStream) InputStream(java.io.InputStream) OrcMetadataReader(io.prestosql.orc.metadata.OrcMetadataReader) ExceptionWrappingMetadataReader(io.prestosql.orc.metadata.ExceptionWrappingMetadataReader) MetadataReader(io.prestosql.orc.metadata.MetadataReader) PostScript(io.prestosql.orc.metadata.PostScript) Slice(io.airlift.slice.Slice)
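
For orientation, the tail that readFrom parses sits back-to-front at the end of the file: the very last byte stores the PostScript length, the PostScript precedes it, and the Footer and Metadata sections come before that. Below is a minimal standalone sketch of the same offset arithmetic; the sizes are invented example values, not read from a real file.

// Sketch of the ORC tail layout arithmetic used by readFrom (hypothetical sizes).
public class OrcTailLayoutSketch {
    public static void main(String[] args) {
        long fileSize = 10_000;      // total file size (example value)
        int postScriptSize = 23;     // value of the last byte of the file
        int footerSize = 350;        // PostScript.getFooterLength()
        int metadataSize = 120;      // PostScript.getMetadataLength()

        // One byte for the PostScript length itself, then PostScript, Footer, Metadata.
        int completeFooterSize = footerSize + metadataSize + postScriptSize + 1;

        long metadataStart = fileSize - completeFooterSize;
        long footerStart = metadataStart + metadataSize;
        long postScriptStart = footerStart + footerSize;

        System.out.printf("metadata   [%d, %d)%n", metadataStart, footerStart);
        System.out.printf("footer     [%d, %d)%n", footerStart, postScriptStart);
        System.out.printf("postscript [%d, %d)%n", postScriptStart, fileSize - 1);
        System.out.printf("postscript length byte at offset %d%n", fileSize - 1);
    }
}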

Example 2 with CompressionKind

Use of io.prestosql.orc.metadata.CompressionKind in project hetu-core by openlookeng.

In the class OrcTester, the method assertRoundTrip:

private void assertRoundTrip(Type writeType, Type readType, List<?> writeValues, List<?> readValues, List<Map<Integer, TupleDomainFilter>> filters) throws Exception {
    OrcWriterStats stats = new OrcWriterStats();
    for (CompressionKind compression : compressions) {
        boolean hiveSupported = (compression != LZ4) && (compression != ZSTD);
        for (Format format : formats) {
            // write Hive, read Presto
            if (hiveSupported) {
                try (TempFile tempFile = new TempFile()) {
                    writeOrcColumnHive(tempFile.getFile(), format, compression, writeType, writeValues.iterator());
                    assertFileContentsPresto(readType, tempFile, readValues, false, false, useSelectiveOrcReader, filters);
                }
            }
        }
        // write Presto, read Hive and Presto
        try (TempFile tempFile = new TempFile()) {
            writeOrcColumnPresto(tempFile.getFile(), compression, writeType, writeValues.iterator(), stats);
            if (hiveSupported) {
                assertFileContentsHive(readType, tempFile, readValues);
            }
            assertFileContentsPresto(readType, tempFile, readValues, false, false, useSelectiveOrcReader, filters);
            if (skipBatchTestsEnabled) {
                assertFileContentsPresto(readType, tempFile, readValues, true, false, useSelectiveOrcReader, filters);
            }
            if (skipStripeTestsEnabled) {
                assertFileContentsPresto(readType, tempFile, readValues, false, true, useSelectiveOrcReader, filters);
            }
        }
    }
    assertEquals(stats.getWriterSizeInBytes(), 0);
}
Also used : CompressionKind(io.prestosql.orc.metadata.CompressionKind) OrcOutputFormat(org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat)
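
A point worth noting in assertRoundTrip is that the Hive-side round trip is skipped for LZ4 and ZSTD, which the bundled Hive ORC writer does not handle. The following standalone sketch just enumerates the CompressionKind constants and applies the same gating; it assumes nothing beyond the LZ4 and ZSTD constants already referenced above.

import io.prestosql.orc.metadata.CompressionKind;

// Sketch: enumerate compression kinds and flag which ones the Hive-side
// round trip in assertRoundTrip would exercise.
public class CompressionMatrixSketch {
    public static void main(String[] args) {
        for (CompressionKind compression : CompressionKind.values()) {
            boolean hiveSupported = compression != CompressionKind.LZ4
                    && compression != CompressionKind.ZSTD;
            System.out.println(compression + " -> hive round trip: " + hiveSupported);
        }
    }
}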

Example 3 with CompressionKind

Use of io.prestosql.orc.metadata.CompressionKind in project hetu-core by openlookeng.

In the class OrcFileWriterFactory, the method getCompression:

private static CompressionKind getCompression(Properties schema, JobConf configuration) {
    String compressionName = OrcConf.COMPRESS.getString(schema, configuration);
    if (compressionName == null) {
        return CompressionKind.ZLIB;
    }
    CompressionKind compression;
    try {
        compression = CompressionKind.valueOf(compressionName.toUpperCase(ENGLISH));
    } catch (IllegalArgumentException e) {
        throw new PrestoException(HiveErrorCode.HIVE_UNSUPPORTED_FORMAT, "Unknown ORC compression type " + compressionName);
    }
    return compression;
}
Also used : CompressionKind(io.prestosql.orc.metadata.CompressionKind) PrestoException(io.prestosql.spi.PrestoException)
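
The resolution logic reduces to CompressionKind.valueOf on the upper-cased value of the orc.compress table property, with ZLIB as the default when the property is missing. Here is a standalone sketch of that mapping without the Properties/JobConf plumbing; the resolve helper and the plain IllegalArgumentException are illustrative stand-ins, not part of the factory.

import static java.util.Locale.ENGLISH;

import io.prestosql.orc.metadata.CompressionKind;

public class CompressionLookupSketch {
    // Illustrative stand-in for getCompression: the name would normally come
    // from the "orc.compress" table property and may be null or mixed-case.
    static CompressionKind resolve(String compressionName) {
        if (compressionName == null) {
            return CompressionKind.ZLIB; // default when the property is not set
        }
        try {
            return CompressionKind.valueOf(compressionName.toUpperCase(ENGLISH));
        } catch (IllegalArgumentException e) {
            throw new IllegalArgumentException("Unknown ORC compression type " + compressionName, e);
        }
    }

    public static void main(String[] args) {
        System.out.println(resolve(null));     // ZLIB
        System.out.println(resolve("snappy")); // SNAPPY
        System.out.println(resolve("ZSTD"));   // ZSTD
    }
}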

Example 4 with CompressionKind

Use of io.prestosql.orc.metadata.CompressionKind in project hetu-core by openlookeng.

In the class OrcFileWriterFactory, the method createFileWriter:

@Override
public Optional<HiveFileWriter> createFileWriter(Path path, List<String> inputColumnNames, StorageFormat storageFormat, Properties schema, JobConf configuration, ConnectorSession session, Optional<AcidOutputFormat.Options> acidOptions, Optional<HiveACIDWriteType> acidWriteType) {
    if (!OrcOutputFormat.class.getName().equals(storageFormat.getOutputFormat())) {
        return Optional.empty();
    }
    CompressionKind compression = getCompression(schema, configuration);
    // existing tables and partitions may have columns in a different order than the writer is providing, so build
    // an index to rearrange columns in the proper order
    List<String> fileColumnNames = getColumnNames(schema);
    List<Type> fileColumnTypes = getColumnTypes(schema).stream().map(hiveType -> hiveType.getType(typeManager)).collect(toList());
    List<Type> dataFileColumnTypes = fileColumnTypes;
    int[] fileInputColumnIndexes = fileColumnNames.stream().mapToInt(inputColumnNames::indexOf).toArray();
    Optional<HiveFileWriter> deleteDeltaWriter = Optional.empty();
    if (AcidUtils.isTablePropertyTransactional(schema) && !AcidUtils.isInsertOnlyTable(schema)) {
        ImmutableList<String> orcFileColumnNames = ImmutableList.of(OrcPageSourceFactory.ACID_COLUMN_OPERATION, OrcPageSourceFactory.ACID_COLUMN_ORIGINAL_TRANSACTION, OrcPageSourceFactory.ACID_COLUMN_BUCKET, OrcPageSourceFactory.ACID_COLUMN_ROW_ID, OrcPageSourceFactory.ACID_COLUMN_CURRENT_TRANSACTION, OrcPageSourceFactory.ACID_COLUMN_ROW_STRUCT);
        ImmutableList.Builder<RowType.Field> fieldsBuilder = ImmutableList.builder();
        for (int i = 0; i < fileColumnNames.size(); i++) {
            fieldsBuilder.add(new RowType.Field(Optional.of(fileColumnNames.get(i)), fileColumnTypes.get(i)));
        }
        ImmutableList<Type> orcFileColumnTypes = ImmutableList.of(INTEGER, BIGINT, INTEGER, BIGINT, BIGINT, RowType.from(fieldsBuilder.build()));
        fileColumnNames = orcFileColumnNames;
        fileColumnTypes = orcFileColumnTypes;
        if (acidWriteType.isPresent() && acidWriteType.get() == HiveACIDWriteType.UPDATE) {
            AcidOutputFormat.Options deleteOptions = acidOptions.get().clone().writingDeleteDelta(true);
            Path deleteDeltaPath = AcidUtils.createFilename(path.getParent().getParent(), deleteOptions);
            deleteDeltaWriter = createFileWriter(deleteDeltaPath, inputColumnNames, storageFormat, schema, configuration, session, Optional.of(deleteOptions), Optional.of(HiveACIDWriteType.DELETE));
        }
    }
    try {
        FileSystem fileSystem = hdfsEnvironment.getFileSystem(session.getUser(), path, configuration);
        OrcDataSink orcDataSink = createOrcDataSink(session, fileSystem, path);
        Optional<Supplier<OrcDataSource>> validationInputFactory = Optional.empty();
        if (HiveSessionProperties.isOrcOptimizedWriterValidate(session)) {
            validationInputFactory = Optional.of(() -> {
                try {
                    FileStatus fileStatus = fileSystem.getFileStatus(path);
                    return new HdfsOrcDataSource(new OrcDataSourceId(path.toString()), fileStatus.getLen(), HiveSessionProperties.getOrcMaxMergeDistance(session), HiveSessionProperties.getOrcMaxBufferSize(session), HiveSessionProperties.getOrcStreamBufferSize(session), false, fileSystem.open(path), readStats, fileStatus.getModificationTime());
                } catch (IOException e) {
                    throw new PrestoException(HiveErrorCode.HIVE_WRITE_VALIDATION_FAILED, e);
                }
            });
        }
        Callable<Void> rollbackAction = () -> {
            fileSystem.delete(path, false);
            return null;
        };
        return Optional.of(new OrcFileWriter(
                orcDataSink,
                rollbackAction,
                fileColumnNames,
                fileColumnTypes,
                dataFileColumnTypes,
                compression,
                orcWriterOptions
                        .withStripeMinSize(HiveSessionProperties.getOrcOptimizedWriterMinStripeSize(session))
                        .withStripeMaxSize(HiveSessionProperties.getOrcOptimizedWriterMaxStripeSize(session))
                        .withStripeMaxRowCount(HiveSessionProperties.getOrcOptimizedWriterMaxStripeRows(session))
                        .withDictionaryMaxMemory(HiveSessionProperties.getOrcOptimizedWriterMaxDictionaryMemory(session))
                        .withMaxStringStatisticsLimit(HiveSessionProperties.getOrcStringStatisticsLimit(session)),
                writeLegacyVersion,
                fileInputColumnIndexes,
                ImmutableMap.<String, String>builder()
                        .put(HiveMetadata.PRESTO_VERSION_NAME, nodeVersion.toString())
                        .put(HiveMetadata.PRESTO_QUERY_ID_NAME, session.getQueryId())
                        .put("hive.acid.version", String.valueOf(AcidUtils.OrcAcidVersion.ORC_ACID_VERSION))
                        .build(),
                validationInputFactory,
                HiveSessionProperties.getOrcOptimizedWriterValidateMode(session),
                stats,
                acidOptions,
                acidWriteType,
                deleteDeltaWriter,
                path));
    } catch (IOException e) {
        throw new PrestoException(HiveErrorCode.HIVE_WRITER_OPEN_ERROR, "Error creating ORC file", e);
    }
}
Also used : StorageFormat(io.prestosql.plugin.hive.metastore.StorageFormat) FileSystem(org.apache.hadoop.fs.FileSystem) Flatten(org.weakref.jmx.Flatten) HiveUtil.getColumnTypes(io.prestosql.plugin.hive.HiveUtil.getColumnTypes) Callable(java.util.concurrent.Callable) INTEGER(io.prestosql.spi.type.IntegerType.INTEGER) FileStatus(org.apache.hadoop.fs.FileStatus) Supplier(java.util.function.Supplier) Inject(javax.inject.Inject) ImmutableList(com.google.common.collect.ImmutableList) ConnectorSession(io.prestosql.spi.connector.ConnectorSession) Managed(org.weakref.jmx.Managed) OrcDataSink(io.prestosql.orc.OrcDataSink) OrcConf(org.apache.orc.OrcConf) HiveUtil.getColumnNames(io.prestosql.plugin.hive.HiveUtil.getColumnNames) Objects.requireNonNull(java.util.Objects.requireNonNull) RowType(io.prestosql.spi.type.RowType) OrcPageSourceFactory(io.prestosql.plugin.hive.orc.OrcPageSourceFactory) Path(org.apache.hadoop.fs.Path) Type(io.prestosql.spi.type.Type) BIGINT(io.prestosql.spi.type.BigintType.BIGINT) ENGLISH(java.util.Locale.ENGLISH) PrestoException(io.prestosql.spi.PrestoException) Properties(java.util.Properties) ImmutableMap(com.google.common.collect.ImmutableMap) TypeManager(io.prestosql.spi.type.TypeManager) AcidOutputFormat(org.apache.hadoop.hive.ql.io.AcidOutputFormat) OrcWriterStats(io.prestosql.orc.OrcWriterStats) IOException(java.io.IOException) OrcDataSource(io.prestosql.orc.OrcDataSource) OrcWriterOptions(io.prestosql.orc.OrcWriterOptions) JobConf(org.apache.hadoop.mapred.JobConf) List(java.util.List) Collectors.toList(java.util.stream.Collectors.toList) OutputStreamOrcDataSink(io.prestosql.orc.OutputStreamOrcDataSink) CompressionKind(io.prestosql.orc.metadata.CompressionKind) HdfsOrcDataSource(io.prestosql.plugin.hive.orc.HdfsOrcDataSource) Optional(java.util.Optional) OrcOutputFormat(org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat) AcidUtils(org.apache.hadoop.hive.ql.io.AcidUtils) OrcDataSourceId(io.prestosql.orc.OrcDataSourceId)
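
One detail that is easy to miss is fileInputColumnIndexes: for every column of the target file it records that column's position in the writer's input, or -1 when the input does not supply the column. The sketch below reproduces that indexOf-based mapping with hypothetical column names.

import java.util.List;

public class ColumnIndexMappingSketch {
    public static void main(String[] args) {
        // Columns as they exist in the target table or partition (hypothetical names).
        List<String> fileColumnNames = List.of("id", "name", "price", "comment");
        // Columns in the order the writer's input provides them.
        List<String> inputColumnNames = List.of("price", "id", "name");

        // Same construction as in createFileWriter: indexOf yields -1 for
        // file columns that are missing from the input.
        int[] fileInputColumnIndexes = fileColumnNames.stream()
                .mapToInt(inputColumnNames::indexOf)
                .toArray();

        for (int i = 0; i < fileColumnNames.size(); i++) {
            System.out.println(fileColumnNames.get(i) + " <- input index " + fileInputColumnIndexes[i]);
        }
        // Prints: id <- 1, name <- 2, price <- 0, comment <- -1
    }
}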

Aggregations

CompressionKind (io.prestosql.orc.metadata.CompressionKind): 4
PrestoException (io.prestosql.spi.PrestoException): 2
OrcOutputFormat (org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat): 2
ImmutableList (com.google.common.collect.ImmutableList): 1
ImmutableMap (com.google.common.collect.ImmutableMap): 1
Slice (io.airlift.slice.Slice): 1
OrcDataSink (io.prestosql.orc.OrcDataSink): 1
OrcDataSource (io.prestosql.orc.OrcDataSource): 1
OrcDataSourceId (io.prestosql.orc.OrcDataSourceId): 1
OrcWriterOptions (io.prestosql.orc.OrcWriterOptions): 1
OrcWriterStats (io.prestosql.orc.OrcWriterStats): 1
OutputStreamOrcDataSink (io.prestosql.orc.OutputStreamOrcDataSink): 1
ExceptionWrappingMetadataReader (io.prestosql.orc.metadata.ExceptionWrappingMetadataReader): 1
MetadataReader (io.prestosql.orc.metadata.MetadataReader): 1
OrcColumnId (io.prestosql.orc.metadata.OrcColumnId): 1
OrcMetadataReader (io.prestosql.orc.metadata.OrcMetadataReader): 1
PostScript (io.prestosql.orc.metadata.PostScript): 1
OrcInputStream (io.prestosql.orc.stream.OrcInputStream): 1
HiveUtil.getColumnNames (io.prestosql.plugin.hive.HiveUtil.getColumnNames): 1
HiveUtil.getColumnTypes (io.prestosql.plugin.hive.HiveUtil.getColumnTypes): 1