Example 1 with CompressionKind

Use of com.facebook.presto.orc.metadata.CompressionKind in project presto by prestodb.

The class OrcTester, method assertRoundTrip:

private void assertRoundTrip(List<Type> writeTypes, List<Type> readTypes, List<List<?>> writeValues, List<List<?>> readValues, boolean verifyWithHiveReader, List<OrcReaderSettings> settings) throws Exception {
    assertEquals(writeTypes.size(), readTypes.size());
    assertEquals(writeTypes.size(), writeValues.size());
    assertEquals(writeTypes.size(), readValues.size());
    OrcWriterStats stats = new OrcWriterStats();
    for (Format format : formats) {
        if (!readTypes.stream().allMatch(format::supportsType)) {
            return;
        }
        OrcEncoding orcEncoding = format.getOrcEncoding();
        for (CompressionKind compression : compressions) {
            boolean hiveSupported = (compression != LZ4) && (compression != ZSTD);
            // write Hive, read Presto
            if (hiveSupported) {
                try (TempFile tempFile = new TempFile()) {
                    writeOrcColumnsHive(tempFile.getFile(), format, compression, writeTypes, writeValues);
                    assertFileContentsPresto(readTypes, tempFile, readValues, false, false, orcEncoding, format, true, useSelectiveOrcReader, settings, ImmutableMap.of());
                }
            }
            // write Presto, read Hive and Presto
            try (TempFile tempFile = new TempFile()) {
                writeOrcColumnsPresto(tempFile.getFile(), format, compression, Optional.empty(), writeTypes, writeValues, stats);
                if (verifyWithHiveReader && hiveSupported) {
                    assertFileContentsHive(readTypes, tempFile, format, readValues);
                }
                assertFileContentsPresto(readTypes, tempFile, readValues, false, false, orcEncoding, format, false, useSelectiveOrcReader, settings, ImmutableMap.of());
                if (skipBatchTestsEnabled) {
                    assertFileContentsPresto(readTypes, tempFile, readValues, true, false, orcEncoding, format, false, useSelectiveOrcReader, settings, ImmutableMap.of());
                }
                if (skipStripeTestsEnabled) {
                    assertFileContentsPresto(readTypes, tempFile, readValues, false, true, orcEncoding, format, false, useSelectiveOrcReader, settings, ImmutableMap.of());
                }
            }
            // write Presto with encryption, read Presto (DWRF only)
            if (dwrfEncryptionEnabled && format == DWRF) {
                try (TempFile tempFile = new TempFile()) {
                    DwrfWriterEncryption dwrfWriterEncryption = generateWriterEncryption();
                    writeOrcColumnsPresto(tempFile.getFile(), format, compression, Optional.of(dwrfWriterEncryption), writeTypes, writeValues, stats);
                    ImmutableMap.Builder<Integer, Slice> intermediateKeysBuilder = ImmutableMap.builder();
                    for (int i = 0; i < dwrfWriterEncryption.getWriterEncryptionGroups().size(); i++) {
                        for (Integer node : dwrfWriterEncryption.getWriterEncryptionGroups().get(i).getNodes()) {
                            intermediateKeysBuilder.put(node, dwrfWriterEncryption.getWriterEncryptionGroups().get(i).getIntermediateKeyMetadata());
                        }
                    }
                    Map<Integer, Slice> intermediateKeysMap = intermediateKeysBuilder.build();
                    assertFileContentsPresto(readTypes, tempFile, readValues, false, false, orcEncoding, format, false, useSelectiveOrcReader, settings, intermediateKeysMap);
                    if (skipBatchTestsEnabled) {
                        assertFileContentsPresto(readTypes, tempFile, readValues, true, false, orcEncoding, format, false, useSelectiveOrcReader, settings, intermediateKeysMap);
                    }
                    if (skipStripeTestsEnabled) {
                        assertFileContentsPresto(readTypes, tempFile, readValues, false, true, orcEncoding, format, false, useSelectiveOrcReader, settings, intermediateKeysMap);
                    }
                }
            }
        }
    }
    assertEquals(stats.getWriterSizeInBytes(), 0);
}
Also used : OrcUtil(org.apache.hadoop.hive.ql.io.orc.OrcUtil) Page(com.facebook.presto.common.Page) DateTimeZone(org.joda.time.DateTimeZone) Arrays(java.util.Arrays) ZonedDateTime(java.time.ZonedDateTime) PrimitiveObjectInspectorFactory.javaByteObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaByteObjectInspector) Text(org.apache.hadoop.io.Text) ORC_11(com.facebook.presto.orc.OrcTester.Format.ORC_11) PrimitiveObjectInspectorFactory.javaLongObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaLongObjectInspector) DateWritable(org.apache.hadoop.hive.serde2.io.DateWritable) Writable(org.apache.hadoop.io.Writable) ORC_12(com.facebook.presto.orc.OrcTester.Format.ORC_12) PrimitiveObjectInspectorFactory.javaTimestampObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaTimestampObjectInspector) Unit(io.airlift.units.DataSize.Unit) PrimitiveObjectInspectorFactory.javaDateObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaDateObjectInspector) HIVE_ORC_DICTIONARY_ENCODING_INTERVAL(com.facebook.hive.orc.OrcConf.ConfVars.HIVE_ORC_DICTIONARY_ENCODING_INTERVAL) HiveChar(org.apache.hadoop.hive.common.type.HiveChar) OrcStruct(org.apache.hadoop.hive.ql.io.orc.OrcStruct) BigDecimal(java.math.BigDecimal) Chars.truncateToLengthAndTrimSpaces(com.facebook.presto.common.type.Chars.truncateToLengthAndTrimSpaces) Arrays.asList(java.util.Arrays.asList) Slices(io.airlift.slice.Slices) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) StripeInformation(com.facebook.presto.orc.metadata.StripeInformation) BigInteger(java.math.BigInteger) Slices.utf8Slice(io.airlift.slice.Slices.utf8Slice) ObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector) Assert.assertFalse(org.testng.Assert.assertFalse) IntWritable(org.apache.hadoop.io.IntWritable) PrimitiveObjectInspectorFactory.javaByteArrayObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaByteArrayObjectInspector) PrimitiveObjectInspectorFactory.javaFloatObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaFloatObjectInspector) BlockBuilder(com.facebook.presto.common.block.BlockBuilder) PrimitiveObjectInspectorFactory.javaDoubleObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaDoubleObjectInspector) Footer(com.facebook.presto.orc.metadata.Footer) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) Set(java.util.Set) READ_ALL_COLUMNS(org.apache.hadoop.hive.serde2.ColumnProjectionUtils.READ_ALL_COLUMNS) ReaderOptions(org.apache.hadoop.hive.ql.io.orc.OrcFile.ReaderOptions) ZoneId(java.time.ZoneId) UncheckedIOException(java.io.UncheckedIOException) ImmutableMap.toImmutableMap(com.google.common.collect.ImmutableMap.toImmutableMap) BooleanWritable(org.apache.hadoop.io.BooleanWritable) Lists.newArrayList(com.google.common.collect.Lists.newArrayList) SqlTimestamp(com.facebook.presto.common.type.SqlTimestamp) Decimals.rescale(com.facebook.presto.common.type.Decimals.rescale) IS_NOT_NULL(com.facebook.presto.common.predicate.TupleDomainFilter.IS_NOT_NULL) INTEGER(com.facebook.presto.common.type.IntegerType.INTEGER) IS_NULL(com.facebook.presto.common.predicate.TupleDomainFilter.IS_NULL) 
OutputStreamDataSink(com.facebook.presto.common.io.OutputStreamDataSink) OrcInputStream(com.facebook.presto.orc.stream.OrcInputStream) PrimitiveObjectInspectorFactory.javaIntObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaIntObjectInspector) StructField(org.apache.hadoop.hive.serde2.objectinspector.StructField) HIVE_ORC_COMPRESSION(com.facebook.hive.orc.OrcConf.ConfVars.HIVE_ORC_COMPRESSION) TestBigintRange(com.facebook.presto.orc.TrackingTupleDomainFilter.TestBigintRange) JavaHiveCharObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.JavaHiveCharObjectInspector) Iterables(com.google.common.collect.Iterables) StandardTypes(com.facebook.presto.common.type.StandardTypes) DecimalType(com.facebook.presto.common.type.DecimalType) Slice(io.airlift.slice.Slice) TypeSignatureParameter(com.facebook.presto.common.type.TypeSignatureParameter) TINYINT(com.facebook.presto.common.type.TinyintType.TINYINT) MEGABYTE(io.airlift.units.DataSize.Unit.MEGABYTE) DataSize.succinctBytes(io.airlift.units.DataSize.succinctBytes) TIMESTAMP(com.facebook.presto.common.type.TimestampType.TIMESTAMP) OrcLazyObject(com.facebook.hive.orc.lazy.OrcLazyObject) FunctionAndTypeManager.createTestFunctionAndTypeManager(com.facebook.presto.metadata.FunctionAndTypeManager.createTestFunctionAndTypeManager) HiveCharWritable(org.apache.hadoop.hive.serde2.io.HiveCharWritable) DATE(com.facebook.presto.common.type.DateType.DATE) REAL(com.facebook.presto.common.type.RealType.REAL) ArrayList(java.util.ArrayList) UNKNOWN(com.facebook.presto.orc.metadata.KeyProvider.UNKNOWN) SqlDate(com.facebook.presto.common.type.SqlDate) Lists(com.google.common.collect.Lists) PrimitiveObjectInspectorFactory.javaShortObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaShortObjectInspector) HIVE_ORC_BUILD_STRIDE_DICTIONARY(com.facebook.hive.orc.OrcConf.ConfVars.HIVE_ORC_BUILD_STRIDE_DICTIONARY) SqlVarbinary(com.facebook.presto.common.type.SqlVarbinary) BOOLEAN(com.facebook.presto.common.type.BooleanType.BOOLEAN) ImmutableSet.toImmutableSet(com.google.common.collect.ImmutableSet.toImmutableSet) ArrayType(com.facebook.presto.common.type.ArrayType) CharType(com.facebook.presto.common.type.CharType) MAX_BATCH_SIZE(com.facebook.presto.orc.OrcReader.MAX_BATCH_SIZE) Functions(com.google.common.base.Functions) BIGINT(com.facebook.presto.common.type.BigintType.BIGINT) Properties(java.util.Properties) NOOP_ORC_LOCAL_MEMORY_CONTEXT(com.facebook.presto.orc.NoopOrcLocalMemoryContext.NOOP_ORC_LOCAL_MEMORY_CONTEXT) SharedBuffer(com.facebook.presto.orc.stream.SharedBuffer) AbstractIterator(com.google.common.collect.AbstractIterator) FileOutputStream(java.io.FileOutputStream) SNAPPY(com.facebook.presto.orc.metadata.CompressionKind.SNAPPY) IOException(java.io.IOException) ObjectInspectorFactory.getStandardStructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory.getStandardStructObjectInspector) Field(java.lang.reflect.Field) DecimalTypeInfo(org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo) File(java.io.File) SettableStructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.SettableStructObjectInspector) SMALLINT(com.facebook.presto.common.type.SmallintType.SMALLINT) LZ4(com.facebook.presto.orc.metadata.CompressionKind.LZ4) FloatWritable(org.apache.hadoop.io.FloatWritable) RowType(com.facebook.presto.common.type.RowType) ZSTD(com.facebook.presto.orc.metadata.CompressionKind.ZSTD) 
FunctionAndTypeManager(com.facebook.presto.metadata.FunctionAndTypeManager) RecordWriter(org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter) OrcFile(org.apache.hadoop.hive.ql.io.orc.OrcFile) BOTH(com.facebook.presto.orc.OrcWriteValidation.OrcWriteValidationMode.BOTH) BigintRange(com.facebook.presto.common.predicate.TupleDomainFilter.BigintRange) LongWritable(org.apache.hadoop.io.LongWritable) OrcSerde(org.apache.hadoop.hive.ql.io.orc.OrcSerde) Preconditions.checkArgument(com.google.common.base.Preconditions.checkArgument) TestDoubleRange(com.facebook.presto.orc.TrackingTupleDomainFilter.TestDoubleRange) OrcConf(org.apache.orc.OrcConf) SESSION(com.facebook.presto.testing.TestingConnectorSession.SESSION) Path(org.apache.hadoop.fs.Path) Reader(org.apache.hadoop.hive.ql.io.orc.Reader) RuntimeStats(com.facebook.presto.common.RuntimeStats) SqlDecimal(com.facebook.presto.common.type.SqlDecimal) ShortWritable(org.apache.hadoop.hive.serde2.io.ShortWritable) ImmutableSet(com.google.common.collect.ImmutableSet) ImmutableMap(com.google.common.collect.ImmutableMap) DOUBLE(com.facebook.presto.common.type.DoubleType.DOUBLE) NONE(com.facebook.presto.orc.metadata.CompressionKind.NONE) Timestamp(java.sql.Timestamp) VarcharType(com.facebook.presto.common.type.VarcharType) Assert.assertNotNull(org.testng.Assert.assertNotNull) StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector) Collectors(java.util.stream.Collectors) FileNotFoundException(java.io.FileNotFoundException) NOOP_ORC_AGGREGATED_MEMORY_CONTEXT(com.facebook.presto.orc.NoopOrcAggregatedMemoryContext.NOOP_ORC_AGGREGATED_MEMORY_CONTEXT) Objects(java.util.Objects) DataSize(io.airlift.units.DataSize) List(java.util.List) CompressionKind(com.facebook.presto.orc.metadata.CompressionKind) LocalDate(java.time.LocalDate) Entry(java.util.Map.Entry) VarbinaryType(com.facebook.presto.common.type.VarbinaryType) Optional(java.util.Optional) OrcDecompressor.createOrcDecompressor(com.facebook.presto.orc.OrcDecompressor.createOrcDecompressor) READ_COLUMN_IDS_CONF_STR(org.apache.hadoop.hive.serde2.ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR) OrcOutputFormat(org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat) IntStream(java.util.stream.IntStream) TypeInfoFactory.getCharTypeInfo(org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory.getCharTypeInfo) MapType(com.facebook.presto.common.type.MapType) PrimitiveObjectInspectorFactory.javaBooleanObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaBooleanObjectInspector) VARCHAR(com.facebook.presto.common.type.VarcharType.VARCHAR) DateTimeTestingUtils.sqlTimestampOf(com.facebook.presto.testing.DateTimeTestingUtils.sqlTimestampOf) Assert.assertEquals(org.testng.Assert.assertEquals) HashMap(java.util.HashMap) HIVE_ORC_ENTROPY_STRING_THRESHOLD(com.facebook.hive.orc.OrcConf.ConfVars.HIVE_ORC_ENTROPY_STRING_THRESHOLD) DoubleWritable(org.apache.hadoop.io.DoubleWritable) Function(java.util.function.Function) PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector) ZLIB(com.facebook.presto.orc.metadata.CompressionKind.ZLIB) Varchars.truncateToLength(com.facebook.presto.common.type.Varchars.truncateToLength) Subfield(com.facebook.presto.common.Subfield) ImmutableList(com.google.common.collect.ImmutableList) FilterFunction(com.facebook.presto.common.predicate.FilterFunction) 
ByteWritable(org.apache.hadoop.io.ByteWritable) Objects.requireNonNull(java.util.Objects.requireNonNull) BytesWritable(org.apache.hadoop.io.BytesWritable) TimestampWritable(org.apache.hadoop.hive.serde2.io.TimestampWritable) Math.toIntExact(java.lang.Math.toIntExact) OrcFileTailSource(com.facebook.presto.orc.cache.OrcFileTailSource) ObjectInspectorFactory.getStandardMapObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory.getStandardMapObjectInspector) Type(com.facebook.presto.common.type.Type) NamedTypeSignature(com.facebook.presto.common.type.NamedTypeSignature) Iterator(java.util.Iterator) Assert.fail(org.testng.Assert.fail) NO_ENCRYPTION(com.facebook.presto.orc.DwrfEncryptionProvider.NO_ENCRYPTION) Decimals(com.facebook.presto.common.type.Decimals) VARBINARY(com.facebook.presto.common.type.VarbinaryType.VARBINARY) Maps(com.google.common.collect.Maps) ObjectInspectorFactory.getStandardListObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory.getStandardListObjectInspector) Date(java.sql.Date) TupleDomainFilter(com.facebook.presto.common.predicate.TupleDomainFilter) JobConf(org.apache.hadoop.mapred.JobConf) DWRF(com.facebook.presto.orc.OrcTester.Format.DWRF) StorageOrcFileTailSource(com.facebook.presto.orc.cache.StorageOrcFileTailSource) Collectors.toList(java.util.stream.Collectors.toList) StripeFooter(com.facebook.presto.orc.metadata.StripeFooter) Serializer(org.apache.hadoop.hive.serde2.Serializer) HiveDecimal(org.apache.hadoop.hive.common.type.HiveDecimal) TestingOrcPredicate.createOrcPredicate(com.facebook.presto.orc.TestingOrcPredicate.createOrcPredicate) Assert.assertTrue(org.testng.Assert.assertTrue) RowFieldName(com.facebook.presto.common.type.RowFieldName) PrimitiveObjectInspectorFactory.javaStringObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaStringObjectInspector) Block(com.facebook.presto.common.block.Block) HiveDecimalWritable(org.apache.hadoop.hive.serde2.io.HiveDecimalWritable) DoubleRange(com.facebook.presto.common.predicate.TupleDomainFilter.DoubleRange) SECONDS(java.util.concurrent.TimeUnit.SECONDS) InputStream(java.io.InputStream)
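
A minimal driver sketch for context: tests typically reach assertRoundTrip through OrcTester's public entry points rather than calling it directly. The factory quickOrcTester() and the testRoundTrip(Type, List<?>) overload below are assumptions about this version of the class; adjust to the overloads your checkout actually exposes.

import static com.facebook.presto.common.type.VarcharType.VARCHAR;

import com.google.common.collect.ImmutableList;

public class RoundTripSketch {
    public static void main(String[] args) throws Exception {
        // Hypothetical entry point: exercises every configured Format x CompressionKind pair.
        OrcTester tester = OrcTester.quickOrcTester();
        tester.testRoundTrip(VARCHAR, ImmutableList.of("a", "b", ""));
    }
}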

Example 2 with CompressionKind

Use of com.facebook.presto.orc.metadata.CompressionKind in project presto by prestodb.

The class TestOrcSelectiveStreamReaders, method testEmptyStrings:

/**
 * This test exercises SliceDirectSelectiveStreamReader for the case where all elements to read are empty strings. The output Block should be a valid VariableWidthBlock with an
 * empty Slice. It simulates a problem seen in production. The state of SliceDirectSelectiveStreamReader that reproduces the problem is:
 * - dataStream: null
 * - presentStream: null
 * - lengthStream: not null
 * - filter: null
 * - outputRequired: true
 * - offsets array: non-zero
 * The test issues two reads: the first reads a non-empty string and populates non-zero offsets; the second reads the empty string with the above conditions met.
 */
@Test
public void testEmptyStrings() throws Exception {
    Type type = VARCHAR;
    List<Type> types = ImmutableList.of(type);
    List<List<?>> values = ImmutableList.of(ImmutableList.of("a", ""));
    for (OrcTester.Format format : formats) {
        if (!types.stream().allMatch(format::supportsType)) {
            return;
        }
        for (CompressionKind compression : compressions) {
            // Use try-with-resources so the temp file is cleaned up even if the read fails.
            try (TempFile tempFile = new TempFile()) {
                writeOrcColumnsPresto(tempFile.getFile(), format, compression, Optional.empty(), types, values, new OrcWriterStats());
                OrcPredicate orcPredicate = createOrcPredicate(types, values, DWRF, false);
                Map<Integer, Type> includedColumns = IntStream.range(0, types.size()).boxed().collect(toImmutableMap(Function.identity(), types::get));
                List<Integer> outputColumns = IntStream.range(0, types.size()).boxed().collect(toImmutableList());
                OrcAggregatedMemoryContext systemMemoryUsage = new TestingHiveOrcAggregatedMemoryContext();
                try (OrcSelectiveRecordReader recordReader = createCustomOrcSelectiveRecordReader(tempFile.getFile(), format.getOrcEncoding(), orcPredicate, types, 1, ImmutableMap.of(), ImmutableList.of(), ImmutableMap.of(), OrcTester.OrcReaderSettings.builder().build().getRequiredSubfields(), ImmutableMap.of(), ImmutableMap.of(), includedColumns, outputColumns, false, systemMemoryUsage, false)) {
                    assertEquals(recordReader.getReaderPosition(), 0);
                    assertEquals(recordReader.getFilePosition(), 0);
                    SelectiveStreamReader streamReader = recordReader.getStreamReaders()[0];
                    // Read the first non-empty element. Do not call streamReader.getBlock() to preserve the offsets array in SliceDirectSelectiveStreamReader.
                    int batchSize = min(recordReader.prepareNextBatch(), 1);
                    int[] positions = IntStream.range(0, batchSize).toArray();
                    streamReader.read(0, positions, batchSize);
                    recordReader.batchRead(batchSize);
                    // Read the second element: an empty string. Set the dataStream in SliceDirectSelectiveStreamReader to null to simulate the conditions causing the problem.
                    ((SliceSelectiveStreamReader) streamReader).resetDataStream();
                    batchSize = min(recordReader.prepareNextBatch(), 1);
                    positions = IntStream.range(0, batchSize).toArray();
                    streamReader.read(0, positions, batchSize);
                    recordReader.batchRead(batchSize);
                    Block block = streamReader.getBlock(positions, batchSize);
                    List<?> expectedValues = ImmutableList.of("");
                    assertBlockEquals(type, block, expectedValues, 0);
                    assertEquals(recordReader.getReaderPosition(), 1);
                    assertEquals(recordReader.getFilePosition(), 1);
                }
            }
        }
    }
}
Also used : IntStream(java.util.stream.IntStream) OrcTester.createCustomOrcSelectiveRecordReader(com.facebook.presto.orc.OrcTester.createCustomOrcSelectiveRecordReader) VARCHAR(com.facebook.presto.common.type.VarcharType.VARCHAR) SliceSelectiveStreamReader(com.facebook.presto.orc.reader.SliceSelectiveStreamReader) ORC_11(com.facebook.presto.orc.OrcTester.Format.ORC_11) Assert.assertEquals(org.testng.Assert.assertEquals) Test(org.testng.annotations.Test) ORC_12(com.facebook.presto.orc.OrcTester.Format.ORC_12) Function(java.util.function.Function) OrcTester.assertBlockEquals(com.facebook.presto.orc.OrcTester.assertBlockEquals) ZLIB(com.facebook.presto.orc.metadata.CompressionKind.ZLIB) ImmutableList(com.google.common.collect.ImmutableList) Map(java.util.Map) OrcTester.writeOrcColumnsPresto(com.facebook.presto.orc.OrcTester.writeOrcColumnsPresto) Type(com.facebook.presto.common.type.Type) ImmutableSet(com.google.common.collect.ImmutableSet) ImmutableMap(com.google.common.collect.ImmutableMap) NONE(com.facebook.presto.orc.metadata.CompressionKind.NONE) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) Set(java.util.Set) SNAPPY(com.facebook.presto.orc.metadata.CompressionKind.SNAPPY) Math.min(java.lang.Math.min) DWRF(com.facebook.presto.orc.OrcTester.Format.DWRF) List(java.util.List) ImmutableMap.toImmutableMap(com.google.common.collect.ImmutableMap.toImmutableMap) SelectiveStreamReader(com.facebook.presto.orc.reader.SelectiveStreamReader) CompressionKind(com.facebook.presto.orc.metadata.CompressionKind) LZ4(com.facebook.presto.orc.metadata.CompressionKind.LZ4) TestingOrcPredicate.createOrcPredicate(com.facebook.presto.orc.TestingOrcPredicate.createOrcPredicate) Optional(java.util.Optional) Block(com.facebook.presto.common.block.Block) ZSTD(com.facebook.presto.orc.metadata.CompressionKind.ZSTD)
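
A hedged standalone sketch of the invariant the test asserts: a well-formed VariableWidthBlock may contain an entry that is empty but not null. The block here is built directly with a BlockBuilder instead of through SliceDirectSelectiveStreamReader, so the setup is illustrative only.

import static com.facebook.presto.common.type.VarcharType.VARCHAR;
import static io.airlift.slice.Slices.utf8Slice;

import com.facebook.presto.common.block.Block;
import com.facebook.presto.common.block.BlockBuilder;

public class EmptyStringBlockSketch {
    public static void main(String[] args) {
        // One entry holding a zero-length value.
        BlockBuilder builder = VARCHAR.createBlockBuilder(null, 1);
        VARCHAR.writeSlice(builder, utf8Slice(""));
        Block block = builder.build();
        System.out.println(block.isNull(0)); // false: empty, not null
        System.out.println(VARCHAR.getSlice(block, 0).length()); // 0
    }
}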

Example 3 with CompressionKind

Use of com.facebook.presto.orc.metadata.CompressionKind in project presto by prestodb.

The class OrcFileWriterFactory, method getCompression:

private static CompressionKind getCompression(Properties schema, JobConf configuration, OrcEncoding orcEncoding) {
    String compressionName = OrcConf.COMPRESS.getString(schema, configuration);
    if (compressionName == null) {
        return CompressionKind.ZLIB;
    }
    CompressionKind compression;
    try {
        compression = CompressionKind.valueOf(compressionName.toUpperCase(ENGLISH));
    } catch (IllegalArgumentException e) {
        throw new PrestoException(HIVE_UNSUPPORTED_FORMAT, "Unknown " + orcEncoding + " compression type " + compressionName);
    }
    return compression;
}
Also used : CompressionKind(com.facebook.presto.orc.metadata.CompressionKind) PrestoException(com.facebook.presto.spi.PrestoException)
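
To make the lookup rule concrete, here is a small sketch (the name "LZO" is just a sample input, not anything Presto defines): the configured value is upper-cased with a fixed locale and matched against the CompressionKind constants, and any other name is the case getCompression converts to HIVE_UNSUPPORTED_FORMAT.

import static java.util.Locale.ENGLISH;

import com.facebook.presto.orc.metadata.CompressionKind;

public class CompressionLookupSketch {
    public static void main(String[] args) {
        // Case-insensitive match against the enum, as in getCompression above.
        System.out.println(CompressionKind.valueOf("zlib".toUpperCase(ENGLISH))); // ZLIB
        try {
            CompressionKind.valueOf("LZO".toUpperCase(ENGLISH));
        }
        catch (IllegalArgumentException e) {
            // getCompression rethrows this as a PrestoException.
            System.out.println("unsupported compression: LZO");
        }
    }
}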

Example 4 with CompressionKind

Use of com.facebook.presto.orc.metadata.CompressionKind in project presto by prestodb.

The class OrcFileWriterFactory, method createFileWriter:

@Override
public Optional<HiveFileWriter> createFileWriter(Path path, List<String> inputColumnNames, StorageFormat storageFormat, Properties schema, JobConf configuration, ConnectorSession session, Optional<EncryptionInformation> encryptionInformation) {
    if (!HiveSessionProperties.isOrcOptimizedWriterEnabled(session)) {
        return Optional.empty();
    }
    OrcEncoding orcEncoding;
    if (OrcOutputFormat.class.getName().equals(storageFormat.getOutputFormat())) {
        orcEncoding = ORC;
    } else if (com.facebook.hive.orc.OrcOutputFormat.class.getName().equals(storageFormat.getOutputFormat())) {
        orcEncoding = DWRF;
    } else {
        return Optional.empty();
    }
    CompressionKind compression = getCompression(schema, configuration, orcEncoding);
    // existing tables and partitions may have columns in a different order than the writer is providing, so build
    // an index to rearrange columns in the proper order
    List<String> fileColumnNames = Splitter.on(',').trimResults().omitEmptyStrings().splitToList(schema.getProperty(META_TABLE_COLUMNS, ""));
    List<Type> fileColumnTypes = toHiveTypes(schema.getProperty(META_TABLE_COLUMN_TYPES, "")).stream().map(hiveType -> hiveType.getType(typeManager)).collect(toList());
    int[] fileInputColumnIndexes = fileColumnNames.stream().mapToInt(inputColumnNames::indexOf).toArray();
    try {
        FileSystem fileSystem = hdfsEnvironment.getFileSystem(session.getUser(), path, configuration);
        DataSink dataSink = createDataSink(session, fileSystem, path);
        Optional<Supplier<OrcDataSource>> validationInputFactory = Optional.empty();
        if (HiveSessionProperties.isOrcOptimizedWriterValidate(session)) {
            validationInputFactory = Optional.of(() -> {
                try {
                    return new HdfsOrcDataSource(new OrcDataSourceId(path.toString()), fileSystem.getFileStatus(path).getLen(), getOrcMaxMergeDistance(session), getOrcMaxBufferSize(session), getOrcStreamBufferSize(session), false, fileSystem.open(path), readStats);
                } catch (IOException e) {
                    throw new PrestoException(HIVE_WRITE_VALIDATION_FAILED, e);
                }
            });
        }
        Callable<Void> rollbackAction = () -> {
            fileSystem.delete(path, false);
            return null;
        };
        Optional<DwrfWriterEncryption> dwrfWriterEncryption = createDwrfEncryption(encryptionInformation, fileColumnNames, fileColumnTypes);
        return Optional.of(new OrcFileWriter(
                dataSink,
                rollbackAction,
                orcEncoding,
                fileColumnNames,
                fileColumnTypes,
                compression,
                orcFileWriterConfig.toOrcWriterOptionsBuilder()
                        .withFlushPolicy(DefaultOrcWriterFlushPolicy.builder()
                                .withStripeMinSize(getOrcOptimizedWriterMinStripeSize(session))
                                .withStripeMaxSize(getOrcOptimizedWriterMaxStripeSize(session))
                                .withStripeMaxRowCount(getOrcOptimizedWriterMaxStripeRows(session))
                                .build())
                        .withDictionaryMaxMemory(getOrcOptimizedWriterMaxDictionaryMemory(session))
                        .withMaxStringStatisticsLimit(getOrcStringStatisticsLimit(session))
                        .withIgnoreDictionaryRowGroupSizes(isExecutionBasedMemoryAccountingEnabled(session))
                        .withDwrfStripeCacheEnabled(isDwrfWriterStripeCacheEnabled(session))
                        .withDwrfStripeCacheMaxSize(getDwrfWriterStripeCacheeMaxSize(session))
                        .build(),
                fileInputColumnIndexes,
                ImmutableMap.<String, String>builder()
                        .put(HiveMetadata.PRESTO_VERSION_NAME, nodeVersion.toString())
                        .put(MetastoreUtil.PRESTO_QUERY_ID_NAME, session.getQueryId())
                        .build(),
                hiveStorageTimeZone,
                validationInputFactory,
                getOrcOptimizedWriterValidateMode(session),
                stats,
                dwrfEncryptionProvider,
                dwrfWriterEncryption));
    } catch (IOException e) {
        throw new PrestoException(HIVE_WRITER_OPEN_ERROR, "Error creating " + orcEncoding + " file. " + e.getMessage(), e);
    }
}
Also used : HdfsOrcDataSource(com.facebook.presto.hive.orc.HdfsOrcDataSource) DateTimeZone(org.joda.time.DateTimeZone) FileSystem(org.apache.hadoop.fs.FileSystem) HiveSessionProperties.getDwrfWriterStripeCacheeMaxSize(com.facebook.presto.hive.HiveSessionProperties.getDwrfWriterStripeCacheeMaxSize) HIVE_WRITE_VALIDATION_FAILED(com.facebook.presto.hive.HiveErrorCode.HIVE_WRITE_VALIDATION_FAILED) DataSink(com.facebook.presto.common.io.DataSink) HiveSessionProperties.getOrcOptimizedWriterMaxDictionaryMemory(com.facebook.presto.hive.HiveSessionProperties.getOrcOptimizedWriterMaxDictionaryMemory) OrcConf(org.apache.orc.OrcConf) HiveSessionProperties.getOrcOptimizedWriterMinStripeSize(com.facebook.presto.hive.HiveSessionProperties.getOrcOptimizedWriterMinStripeSize) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) HiveSessionProperties.getOrcOptimizedWriterValidateMode(com.facebook.presto.hive.HiveSessionProperties.getOrcOptimizedWriterValidateMode) OrcDataSource(com.facebook.presto.orc.OrcDataSource) Splitter(com.google.common.base.Splitter) ENGLISH(java.util.Locale.ENGLISH) WriterEncryptionGroup(com.facebook.presto.orc.WriterEncryptionGroup) CRYPTO_SERVICE(com.facebook.presto.orc.metadata.KeyProvider.CRYPTO_SERVICE) HIVE_UNSUPPORTED_FORMAT(com.facebook.presto.hive.HiveErrorCode.HIVE_UNSUPPORTED_FORMAT) KeyProvider(com.facebook.presto.orc.metadata.KeyProvider) META_TABLE_COLUMN_TYPES(org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.META_TABLE_COLUMN_TYPES) StorageFormat(com.facebook.presto.hive.metastore.StorageFormat) ImmutableMap(com.google.common.collect.ImmutableMap) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) HiveSessionProperties.isDwrfWriterStripeCacheEnabled(com.facebook.presto.hive.HiveSessionProperties.isDwrfWriterStripeCacheEnabled) HiveSessionProperties.isExecutionBasedMemoryAccountingEnabled(com.facebook.presto.hive.HiveSessionProperties.isExecutionBasedMemoryAccountingEnabled) ConnectorSession(com.facebook.presto.spi.ConnectorSession) ORC(com.facebook.presto.orc.OrcEncoding.ORC) List(java.util.List) ImmutableMap.toImmutableMap(com.google.common.collect.ImmutableMap.toImmutableMap) DWRF(com.facebook.presto.orc.OrcEncoding.DWRF) CompressionKind(com.facebook.presto.orc.metadata.CompressionKind) ImmutableListMultimap(com.google.common.collect.ImmutableListMultimap) Optional(java.util.Optional) OrcOutputFormat(org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat) HiveType.toHiveTypes(com.facebook.presto.hive.HiveType.toHiveTypes) IntStream(java.util.stream.IntStream) HiveSessionProperties.getOrcOptimizedWriterMaxStripeSize(com.facebook.presto.hive.HiveSessionProperties.getOrcOptimizedWriterMaxStripeSize) Slice(io.airlift.slice.Slice) HiveSessionProperties.getOrcMaxMergeDistance(com.facebook.presto.hive.HiveSessionProperties.getOrcMaxMergeDistance) Flatten(org.weakref.jmx.Flatten) Callable(java.util.concurrent.Callable) META_TABLE_COLUMNS(org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.META_TABLE_COLUMNS) DwrfWriterEncryption(com.facebook.presto.orc.DwrfWriterEncryption) DataSinkFactory(com.facebook.presto.hive.datasink.DataSinkFactory) PrestoException(com.facebook.presto.spi.PrestoException) Supplier(java.util.function.Supplier) UNKNOWN(com.facebook.presto.orc.metadata.KeyProvider.UNKNOWN) Inject(javax.inject.Inject) MetastoreUtil(com.facebook.presto.hive.metastore.MetastoreUtil) Managed(org.weakref.jmx.Managed) TypeManager(com.facebook.presto.common.type.TypeManager) 
HiveSessionProperties.getOrcMaxBufferSize(com.facebook.presto.hive.HiveSessionProperties.getOrcMaxBufferSize) Objects.requireNonNull(java.util.Objects.requireNonNull) OrcType(com.facebook.presto.orc.metadata.OrcType) OrcWriterStats(com.facebook.presto.orc.OrcWriterStats) Type(com.facebook.presto.common.type.Type) DwrfEncryptionProvider(com.facebook.presto.orc.DwrfEncryptionProvider) OrcDataSourceId(com.facebook.presto.orc.OrcDataSourceId) OrcEncoding(com.facebook.presto.orc.OrcEncoding) Properties(java.util.Properties) DefaultOrcWriterFlushPolicy(com.facebook.presto.orc.DefaultOrcWriterFlushPolicy) HiveSessionProperties.getOrcStreamBufferSize(com.facebook.presto.hive.HiveSessionProperties.getOrcStreamBufferSize) IOException(java.io.IOException) HiveSessionProperties.getOrcOptimizedWriterMaxStripeRows(com.facebook.presto.hive.HiveSessionProperties.getOrcOptimizedWriterMaxStripeRows) JobConf(org.apache.hadoop.mapred.JobConf) Collectors.toList(java.util.stream.Collectors.toList) HiveSessionProperties.getOrcStringStatisticsLimit(com.facebook.presto.hive.HiveSessionProperties.getOrcStringStatisticsLimit) HIVE_WRITER_OPEN_ERROR(com.facebook.presto.hive.HiveErrorCode.HIVE_WRITER_OPEN_ERROR)
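
The column-reordering index in createFileWriter is easy to miss in the long method, so here is a self-contained sketch of just that step. The column names are made up for illustration; indexOf returns -1 for file columns the writer does not provide.

import com.google.common.collect.ImmutableList;

import java.util.Arrays;
import java.util.List;

public class ColumnIndexSketch {
    public static void main(String[] args) {
        List<String> fileColumnNames = ImmutableList.of("a", "b", "c");
        List<String> inputColumnNames = ImmutableList.of("c", "a", "b");
        // For each file column, find its position among the writer's input columns.
        int[] fileInputColumnIndexes = fileColumnNames.stream()
                .mapToInt(inputColumnNames::indexOf)
                .toArray();
        System.out.println(Arrays.toString(fileInputColumnIndexes)); // [1, 2, 0]
    }
}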

Example 5 with CompressionKind

Use of com.facebook.presto.orc.metadata.CompressionKind in project presto by prestodb.

The class StorageOrcFileTailSource, method getOrcFileTail:

@Override
public OrcFileTail getOrcFileTail(OrcDataSource orcDataSource, MetadataReader metadataReader, Optional<OrcWriteValidation> writeValidation, boolean cacheable) throws IOException {
    long size = orcDataSource.getSize();
    if (size <= MAGIC.length()) {
        throw new OrcCorruptionException(orcDataSource.getId(), "Invalid file size %s", size);
    }
    // Read the tail of the file
    byte[] buffer = new byte[toIntExact(min(size, expectedFooterSizeInBytes))];
    orcDataSource.readFully(size - buffer.length, buffer);
    // get length of PostScript - last byte of the file
    int postScriptSize = buffer[buffer.length - SIZE_OF_BYTE] & 0xff;
    if (postScriptSize >= buffer.length) {
        throw new OrcCorruptionException(orcDataSource.getId(), "Invalid postscript length %s", postScriptSize);
    }
    // decode the post script
    PostScript postScript;
    try {
        postScript = metadataReader.readPostScript(buffer, buffer.length - SIZE_OF_BYTE - postScriptSize, postScriptSize);
    } catch (OrcCorruptionException e) {
        // check if this is an ORC file and not an RCFile or something else
        if (!isValidHeaderMagic(orcDataSource)) {
            throw new OrcCorruptionException(orcDataSource.getId(), "Not an ORC file");
        }
        throw e;
    }
    // verify this is a supported version
    checkOrcVersion(orcDataSource, postScript.getVersion());
    validateWrite(writeValidation, orcDataSource, validation -> validation.getVersion().equals(postScript.getVersion()), "Unexpected version");
    int bufferSize = toIntExact(postScript.getCompressionBlockSize());
    // check compression codec is supported
    CompressionKind compressionKind = postScript.getCompression();
    validateWrite(writeValidation, orcDataSource, validation -> validation.getCompression() == compressionKind, "Unexpected compression");
    PostScript.HiveWriterVersion hiveWriterVersion = postScript.getHiveWriterVersion();
    int footerSize = toIntExact(postScript.getFooterLength());
    int metadataSize = toIntExact(postScript.getMetadataLength());
    if (footerSize < 0) {
        throw new OrcCorruptionException(orcDataSource.getId(), "Invalid footer length %s", footerSize);
    }
    if (metadataSize < 0) {
        throw new OrcCorruptionException(orcDataSource.getId(), "Invalid metadata length %s", metadataSize);
    }
    // read DWRF stripe cache only if this feature is enabled and it has meaningful data
    boolean readDwrfStripeCache = dwrfStripeCacheEnabled && postScript.getDwrfStripeCacheLength().isPresent() && postScript.getDwrfStripeCacheMode().isPresent() && postScript.getDwrfStripeCacheMode().get() != DwrfStripeCacheMode.NONE;
    int dwrfStripeCacheSize = 0;
    if (readDwrfStripeCache) {
        dwrfStripeCacheSize = postScript.getDwrfStripeCacheLength().getAsInt();
        checkSizes(orcDataSource, metadataSize, dwrfStripeCacheSize);
    }
    // check if extra bytes need to be read
    Slice completeFooterSlice;
    int completeFooterSize = dwrfStripeCacheSize + metadataSize + footerSize + postScriptSize + SIZE_OF_BYTE;
    if (completeFooterSize > buffer.length) {
        // allocate a new buffer large enough for the complete footer
        byte[] newBuffer = new byte[completeFooterSize];
        completeFooterSlice = Slices.wrappedBuffer(newBuffer);
        // initial read was not large enough, so read missing section
        orcDataSource.readFully(size - completeFooterSize, newBuffer, 0, completeFooterSize - buffer.length);
        // copy already read bytes into the new buffer
        completeFooterSlice.setBytes(completeFooterSize - buffer.length, buffer);
    } else {
        // footer is already in the bytes in buffer, just adjust position, length
        completeFooterSlice = Slices.wrappedBuffer(buffer, buffer.length - completeFooterSize, completeFooterSize);
    }
    // metadataSize is set only for ORC files, dwrfStripeCacheSize is set only for DWRF files
    // it should be safe to sum them up to find footer offset
    // TAIL: [ ORC_METADATA{0,1} | DWRF_STRIPE_CACHE {0,1} ] + FOOTER + POST_SCRIPT + POST_SCRIPT_SIZE (1 byte)
    int footerSliceOffset = metadataSize + dwrfStripeCacheSize;
    Slice footerSlice = completeFooterSlice.slice(footerSliceOffset, footerSize);
    Slice metadataSlice = completeFooterSlice.slice(0, metadataSize);
    // set DwrfStripeCacheData only if the stripe cache feature is enabled and the file has the stripe cache
    Optional<DwrfStripeCacheData> dwrfStripeCacheData = Optional.empty();
    if (readDwrfStripeCache) {
        Slice dwrfStripeCacheSlice = completeFooterSlice.slice(0, dwrfStripeCacheSize);
        DwrfStripeCacheMode stripeCacheMode = postScript.getDwrfStripeCacheMode().get();
        dwrfStripeCacheData = Optional.of(new DwrfStripeCacheData(dwrfStripeCacheSlice, dwrfStripeCacheSize, stripeCacheMode));
    }
    return new OrcFileTail(hiveWriterVersion, bufferSize, compressionKind, footerSlice, footerSize, metadataSlice, metadataSize, dwrfStripeCacheData);
}
Also used : CompressionKind(com.facebook.presto.orc.metadata.CompressionKind) OrcFileTail(com.facebook.presto.orc.metadata.OrcFileTail) PostScript(com.facebook.presto.orc.metadata.PostScript) DwrfStripeCacheMode(com.facebook.presto.orc.metadata.DwrfStripeCacheMode) Slice(io.airlift.slice.Slice) DwrfStripeCacheData(com.facebook.presto.orc.metadata.DwrfStripeCacheData) OrcCorruptionException(com.facebook.presto.orc.OrcCorruptionException)
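
A sketch of the tail arithmetic the method relies on, with the sizes pulled out as plain functions. The buffer contents below are hypothetical; the layout follows the TAIL comment in the code: [ORC_METADATA | DWRF_STRIPE_CACHE] + FOOTER + POST_SCRIPT + one trailing length byte.

public class OrcTailMathSketch {
    private static final int SIZE_OF_BYTE = 1; // the trailing PostScript-length byte

    // The last byte of the file stores the PostScript length; mask to read it as unsigned.
    static int postScriptSize(byte[] tailBuffer) {
        return tailBuffer[tailBuffer.length - SIZE_OF_BYTE] & 0xff;
    }

    // Total bytes that must be in memory to decode the whole tail.
    static int completeFooterSize(int dwrfStripeCacheSize, int metadataSize, int footerSize, int postScriptSize) {
        return dwrfStripeCacheSize + metadataSize + footerSize + postScriptSize + SIZE_OF_BYTE;
    }

    public static void main(String[] args) {
        byte[] tail = {0, 0, 0, 23}; // hypothetical tail whose last byte says the PostScript is 23 bytes
        System.out.println(postScriptSize(tail)); // 23
        System.out.println(completeFooterSize(0, 100, 400, 23)); // 524
    }
}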

Aggregations

CompressionKind (com.facebook.presto.orc.metadata.CompressionKind): 5 usages
Type (com.facebook.presto.common.type.Type): 3 usages
Block (com.facebook.presto.common.block.Block): 2 usages
VARCHAR (com.facebook.presto.common.type.VarcharType.VARCHAR): 2 usages
DWRF (com.facebook.presto.orc.OrcTester.Format.DWRF): 2 usages
ORC_11 (com.facebook.presto.orc.OrcTester.Format.ORC_11): 2 usages
ORC_12 (com.facebook.presto.orc.OrcTester.Format.ORC_12): 2 usages
TestingOrcPredicate.createOrcPredicate (com.facebook.presto.orc.TestingOrcPredicate.createOrcPredicate): 2 usages
LZ4 (com.facebook.presto.orc.metadata.CompressionKind.LZ4): 2 usages
NONE (com.facebook.presto.orc.metadata.CompressionKind.NONE): 2 usages
SNAPPY (com.facebook.presto.orc.metadata.CompressionKind.SNAPPY): 2 usages
ZLIB (com.facebook.presto.orc.metadata.CompressionKind.ZLIB): 2 usages
ZSTD (com.facebook.presto.orc.metadata.CompressionKind.ZSTD): 2 usages
PrestoException (com.facebook.presto.spi.PrestoException): 2 usages
ImmutableList (com.google.common.collect.ImmutableList): 2 usages
ImmutableList.toImmutableList (com.google.common.collect.ImmutableList.toImmutableList): 2 usages
ImmutableMap (com.google.common.collect.ImmutableMap): 2 usages
ImmutableMap.toImmutableMap (com.google.common.collect.ImmutableMap.toImmutableMap): 2 usages
ImmutableSet (com.google.common.collect.ImmutableSet): 2 usages
Slice (io.airlift.slice.Slice): 2 usages