Search in sources:

Example 1 with PARQUET

Use of io.trino.plugin.hive.HiveStorageFormat.PARQUET in project trino by trinodb.

The class DeltaLakePageSink, method createParquetFileWriter:

private FileWriter createParquetFileWriter(Path path) {
    ParquetWriterOptions parquetWriterOptions = ParquetWriterOptions.builder()
            .setMaxBlockSize(getParquetWriterBlockSize(session))
            .setMaxPageSize(getParquetWriterPageSize(session))
            .build();
    CompressionCodecName compressionCodecName = getCompressionCodec(session).getParquetCompressionCodec();
    try {
        FileSystem fileSystem = hdfsEnvironment.getFileSystem(session.getIdentity(), path, conf);
        Callable<Void> rollbackAction = () -> {
            fileSystem.delete(path, false);
            return null;
        };
        List<Type> parquetTypes = dataColumnTypes.stream().map(type -> {
            if (type instanceof TimestampWithTimeZoneType) {
                verify(((TimestampWithTimeZoneType) type).getPrecision() == 3, "Unsupported type: %s", type);
                return TIMESTAMP_MILLIS;
            }
            return type;
        }).collect(toImmutableList());
        // we use identity column mapping; input page already contains only data columns per
        // DeltaLakePageSink.getDataPage()
        int[] identityMapping = new int[dataColumnTypes.size()];
        for (int i = 0; i < identityMapping.length; ++i) {
            identityMapping[i] = i;
        }
        ParquetSchemaConverter schemaConverter = new ParquetSchemaConverter(parquetTypes, dataColumnNames);
        return new ParquetFileWriter(
                fileSystem.create(path),
                rollbackAction,
                parquetTypes,
                schemaConverter.getMessageType(),
                schemaConverter.getPrimitiveTypes(),
                parquetWriterOptions,
                identityMapping,
                compressionCodecName,
                trinoVersion);
    } catch (IOException e) {
        throw new TrinoException(DELTA_LAKE_BAD_WRITE, "Error creating Parquet file", e);
    }
}
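
For context, the object returned above is Trino's FileWriter abstraction. The following sketch is illustrative only and not taken from DeltaLakePageSink; it assumes the usual appendRows/commit/rollback lifecycle of io.trino.plugin.hive.FileWriter to show how a page sink might drive the writer created here.

// Illustrative sketch, not DeltaLakePageSink code: drives a writer produced by
// createParquetFileWriter, assuming the appendRows(Page)/commit()/rollback() lifecycle.
private static void writePages(FileWriter writer, List<Page> dataPages) {
    try {
        for (Page dataPage : dataPages) {
            writer.appendRows(dataPage); // buffer rows into the Parquet writer
        }
        writer.commit(); // flush remaining row groups and finalize the file
    } catch (RuntimeException e) {
        writer.rollback(); // runs the rollback action above, deleting the partially written file
        throw e;
    }
}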
Also used : RecordFileWriter(io.trino.plugin.hive.RecordFileWriter) DateTimeZone(org.joda.time.DateTimeZone) FileSystem(org.apache.hadoop.fs.FileSystem) CompressionConfigUtil.configureCompression(io.trino.plugin.hive.util.CompressionConfigUtil.configureCompression) TransactionLogAccess(io.trino.plugin.deltalake.transactionlog.TransactionLogAccess) Slices.wrappedBuffer(io.airlift.slice.Slices.wrappedBuffer) PARQUET(io.trino.plugin.hive.HiveStorageFormat.PARQUET) TimestampWithTimeZoneType(io.trino.spi.type.TimestampWithTimeZoneType) Block(io.trino.spi.block.Block) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) ParquetFileWriter(io.trino.plugin.hive.parquet.ParquetFileWriter) Path(org.apache.hadoop.fs.Path) DeltaLakeSessionProperties.getParquetWriterPageSize(io.trino.plugin.deltalake.DeltaLakeSessionProperties.getParquetWriterPageSize) HdfsEnvironment(io.trino.plugin.hive.HdfsEnvironment) ParquetSchemaConverter(io.trino.parquet.writer.ParquetSchemaConverter) Collection(java.util.Collection) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) DeltaLakeSessionProperties.isParquetOptimizedWriterEnabled(io.trino.plugin.deltalake.DeltaLakeSessionProperties.isParquetOptimizedWriterEnabled) TrinoException(io.trino.spi.TrinoException) FileUtils.escapePathName(org.apache.hadoop.hive.common.FileUtils.escapePathName) String.format(java.lang.String.format) Collectors.joining(java.util.stream.Collectors.joining) DeltaLakeSessionProperties.getParquetWriterBlockSize(io.trino.plugin.deltalake.DeltaLakeSessionProperties.getParquetWriterBlockSize) List(java.util.List) ImmutableMap.toImmutableMap(com.google.common.collect.ImmutableMap.toImmutableMap) Function.identity(java.util.function.Function.identity) FileUtils(org.apache.hadoop.hive.common.FileUtils) Optional(java.util.Optional) CompressionCodecName(org.apache.parquet.hadoop.metadata.CompressionCodecName) JsonCodec(io.airlift.json.JsonCodec) ListenableFuture(com.google.common.util.concurrent.ListenableFuture) Slice(io.airlift.slice.Slice) Logger(io.airlift.log.Logger) TIMESTAMP_MILLIS(io.trino.spi.type.TimestampType.TIMESTAMP_MILLIS) Type(io.trino.spi.type.Type) ConfigurationUtils.toJobConf(io.trino.plugin.hive.util.ConfigurationUtils.toJobConf) Page(io.trino.spi.Page) Callable(java.util.concurrent.Callable) CompletableFuture(java.util.concurrent.CompletableFuture) IOConstants(org.apache.hadoop.hive.ql.io.IOConstants) StorageFormat.fromHiveStorageFormat(io.trino.plugin.hive.metastore.StorageFormat.fromHiveStorageFormat) ArrayList(java.util.ArrayList) FileWriter(io.trino.plugin.hive.FileWriter) HiveType(io.trino.plugin.hive.HiveType) ImmutableList(com.google.common.collect.ImmutableList) Verify.verify(com.google.common.base.Verify.verify) PageIndexerFactory(io.trino.spi.PageIndexerFactory) Objects.requireNonNull(java.util.Objects.requireNonNull) ParquetWriterOptions(io.trino.parquet.writer.ParquetWriterOptions) HiveWriteUtils(io.trino.plugin.hive.util.HiveWriteUtils) HiveTypeName(io.trino.plugin.hive.HiveTypeName) ConnectorPageSink(io.trino.spi.connector.ConnectorPageSink) Properties(java.util.Properties) DELTA_LAKE_BAD_WRITE(io.trino.plugin.deltalake.DeltaLakeErrorCode.DELTA_LAKE_BAD_WRITE) MoreFutures(io.airlift.concurrent.MoreFutures) HivePartitionKey(io.trino.plugin.hive.HivePartitionKey) IOException(java.io.IOException) ConnectorSession(io.trino.spi.connector.ConnectorSession) Ints(com.google.common.primitives.Ints) JobConf(org.apache.hadoop.mapred.JobConf) 
DeltaLakeSessionProperties.getCompressionCodec(io.trino.plugin.deltalake.DeltaLakeSessionProperties.getCompressionCodec) PageIndexer(io.trino.spi.PageIndexer) Futures(com.google.common.util.concurrent.Futures) UUID.randomUUID(java.util.UUID.randomUUID) Collectors.toList(java.util.stream.Collectors.toList) TypeManager(io.trino.spi.type.TypeManager)

Example 2 with PARQUET

Use of io.trino.plugin.hive.HiveStorageFormat.PARQUET in project trino by trinodb.

The class TestHiveFileFormats, method testParquetPageSourceSchemaEvolution:

@Test(dataProvider = "rowCount")
public void testParquetPageSourceSchemaEvolution(int rowCount) throws Exception {
    List<TestColumn> writeColumns = getTestColumnsSupportedByParquet();
    // test index-based access
    List<TestColumn> readColumns = writeColumns.stream()
            .map(column -> new TestColumn(
                    column.getName() + "_new",
                    column.getObjectInspector(),
                    column.getWriteValue(),
                    column.getExpectedValue(),
                    column.isPartitionKey()))
            .collect(toList());
    assertThatFileFormat(PARQUET)
            .withWriteColumns(writeColumns)
            .withReadColumns(readColumns)
            .withSession(PARQUET_SESSION)
            .withRowsCount(rowCount)
            .isReadableByPageSource(new ParquetPageSourceFactory(HDFS_ENVIRONMENT, STATS, new ParquetReaderConfig(), new HiveConfig()));
    // test name-based access
    readColumns = Lists.reverse(writeColumns);
    assertThatFileFormat(PARQUET)
            .withWriteColumns(writeColumns)
            .withReadColumns(readColumns)
            .withSession(PARQUET_SESSION_USE_NAME)
            .isReadableByPageSource(new ParquetPageSourceFactory(HDFS_ENVIRONMENT, STATS, new ParquetReaderConfig(), new HiveConfig()));
}
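
The two assertions exercise schema evolution under both Parquet column-resolution modes: with PARQUET_SESSION columns are matched by index, so appending "_new" to every name still resolves, while with PARQUET_SESSION_USE_NAME columns are matched by name, so the reversed (reordered) column list still resolves. The standalone sketch below is not Trino code and uses made-up column names; it merely illustrates the difference between the two lookup strategies.

// Standalone illustration (not Trino code) of index-based vs. name-based column resolution.
import java.util.List;

public class ColumnResolution {
    public static void main(String[] args) {
        List<String> fileColumns = List.of("id", "name", "price"); // columns as written to the Parquet file
        List<String> renamedColumns = List.of("id_new", "name_new", "price_new"); // table schema after a rename
        List<String> reorderedColumns = List.of("price", "name", "id"); // table schema after reordering

        // Index-based access: table position i maps to file position i, so renames are harmless.
        for (int i = 0; i < renamedColumns.size(); i++) {
            System.out.printf("%s -> file column %d (%s)%n", renamedColumns.get(i), i, fileColumns.get(i));
        }

        // Name-based access: a table column must exist by name in the file, so reordering is
        // harmless but a renamed column would not be found (indexOf would return -1).
        for (String column : reorderedColumns) {
            System.out.printf("%s -> file column %d%n", column, fileColumns.indexOf(column));
        }
    }
}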
Also used : OrcFileWriterFactory(io.trino.plugin.hive.orc.OrcFileWriterFactory) ParquetFileWriterFactory(io.trino.plugin.hive.parquet.ParquetFileWriterFactory) Test(org.testng.annotations.Test) NO_ACID_TRANSACTION(io.trino.plugin.hive.acid.AcidTransaction.NO_ACID_TRANSACTION) HiveTestUtils.createGenericHiveRecordCursorProvider(io.trino.plugin.hive.HiveTestUtils.createGenericHiveRecordCursorProvider) TrinoExceptionAssert.assertTrinoExceptionThrownBy(io.trino.testing.assertions.TrinoExceptionAssert.assertTrinoExceptionThrownBy) PARQUET(io.trino.plugin.hive.HiveStorageFormat.PARQUET) FileSplit(org.apache.hadoop.mapred.FileSplit) Locale(java.util.Locale) Configuration(org.apache.hadoop.conf.Configuration) StructuralTestUtil.rowBlockOf(io.trino.testing.StructuralTestUtil.rowBlockOf) Slices.utf8Slice(io.airlift.slice.Slices.utf8Slice) ConnectorPageSource(io.trino.spi.connector.ConnectorPageSource) ObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector) AVRO(io.trino.plugin.hive.HiveStorageFormat.AVRO) SERIALIZATION_LIB(org.apache.hadoop.hive.serde.serdeConstants.SERIALIZATION_LIB) LzoCodec(io.airlift.compress.lzo.LzoCodec) ImmutableSet(com.google.common.collect.ImmutableSet) TimeZone(java.util.TimeZone) MapObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector) BeforeClass(org.testng.annotations.BeforeClass) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) Set(java.util.Set) Assert.assertNotNull(org.testng.Assert.assertNotNull) StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector) Instant(java.time.Instant) Collectors(java.util.stream.Collectors) HDFS_ENVIRONMENT(io.trino.plugin.hive.HiveTestUtils.HDFS_ENVIRONMENT) String.format(java.lang.String.format) Preconditions.checkState(com.google.common.base.Preconditions.checkState) List(java.util.List) ColumnMapping.buildColumnMappings(io.trino.plugin.hive.HivePageSourceProvider.ColumnMapping.buildColumnMappings) OrcReaderConfig(io.trino.plugin.hive.orc.OrcReaderConfig) VarcharTypeInfo(org.apache.hadoop.hive.serde2.typeinfo.VarcharTypeInfo) Optional(java.util.Optional) ParquetReaderConfig(io.trino.plugin.hive.parquet.ParquetReaderConfig) StructField(org.apache.hadoop.hive.serde2.objectinspector.StructField) RcFilePageSourceFactory(io.trino.plugin.hive.rcfile.RcFilePageSourceFactory) ListObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector) DataProvider(org.testng.annotations.DataProvider) PrimitiveCategory(org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory) Type(io.trino.spi.type.Type) Assert.assertEquals(org.testng.Assert.assertEquals) CSV(io.trino.plugin.hive.HiveStorageFormat.CSV) OptionalInt(java.util.OptionalInt) PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector) LzopCodec(io.airlift.compress.lzo.LzopCodec) SymlinkTextInputFormat(org.apache.hadoop.hive.ql.io.SymlinkTextInputFormat) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) ParquetPageSourceFactory(io.trino.plugin.hive.parquet.ParquetPageSourceFactory) HiveVarchar(org.apache.hadoop.hive.common.type.HiveVarchar) ParquetWriterConfig(io.trino.plugin.hive.parquet.ParquetWriterConfig) Lists(com.google.common.collect.Lists) ImmutableList(com.google.common.collect.ImmutableList) 
PrimitiveObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector) SEQUENCEFILE(io.trino.plugin.hive.HiveStorageFormat.SEQUENCEFILE) OrcReaderOptions(io.trino.orc.OrcReaderOptions) OrcPageSourceFactory(io.trino.plugin.hive.orc.OrcPageSourceFactory) RecordPageSource(io.trino.spi.connector.RecordPageSource) Objects.requireNonNull(java.util.Objects.requireNonNull) TEXTFILE(io.trino.plugin.hive.HiveStorageFormat.TEXTFILE) JSON(io.trino.plugin.hive.HiveStorageFormat.JSON) OrcWriterConfig(io.trino.plugin.hive.orc.OrcWriterConfig) RCBINARY(io.trino.plugin.hive.HiveStorageFormat.RCBINARY) RecordCursor(io.trino.spi.connector.RecordCursor) Properties(java.util.Properties) ORC(io.trino.plugin.hive.HiveStorageFormat.ORC) HiveTestUtils.getTypes(io.trino.plugin.hive.HiveTestUtils.getTypes) TESTING_TYPE_MANAGER(io.trino.type.InternalTypeManager.TESTING_TYPE_MANAGER) IOException(java.io.IOException) ConnectorSession(io.trino.spi.connector.ConnectorSession) ObjectInspectorFactory.getStandardStructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory.getStandardStructObjectInspector) TupleDomain(io.trino.spi.predicate.TupleDomain) UTC(org.joda.time.DateTimeZone.UTC) File(java.io.File) TestingConnectorSession(io.trino.testing.TestingConnectorSession) SESSION(io.trino.plugin.hive.HiveTestUtils.SESSION) HiveTestUtils.getHiveSession(io.trino.plugin.hive.HiveTestUtils.getHiveSession) Collectors.toList(java.util.stream.Collectors.toList) OrcWriterOptions(io.trino.orc.OrcWriterOptions) RCTEXT(io.trino.plugin.hive.HiveStorageFormat.RCTEXT) FILE_INPUT_FORMAT(org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.FILE_INPUT_FORMAT) Assert.assertTrue(org.testng.Assert.assertTrue) PrimitiveObjectInspectorFactory.javaStringObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaStringObjectInspector)

Aggregations

ImmutableList (com.google.common.collect.ImmutableList)2 ImmutableList.toImmutableList (com.google.common.collect.ImmutableList.toImmutableList)2 PARQUET (io.trino.plugin.hive.HiveStorageFormat.PARQUET)2 ConnectorSession (io.trino.spi.connector.ConnectorSession)2 Type (io.trino.spi.type.Type)2 IOException (java.io.IOException)2 String.format (java.lang.String.format)2 ArrayList (java.util.ArrayList)2 List (java.util.List)2 Objects.requireNonNull (java.util.Objects.requireNonNull)2 Optional (java.util.Optional)2 Properties (java.util.Properties)2 Collectors.toList (java.util.stream.Collectors.toList)2 Configuration (org.apache.hadoop.conf.Configuration)2 Preconditions.checkState (com.google.common.base.Preconditions.checkState)1 Verify.verify (com.google.common.base.Verify.verify)1 ImmutableMap.toImmutableMap (com.google.common.collect.ImmutableMap.toImmutableMap)1 ImmutableSet (com.google.common.collect.ImmutableSet)1 Lists (com.google.common.collect.Lists)1 Ints (com.google.common.primitives.Ints)1