Example 11 with ParquetReaderConfig

Use of io.trino.plugin.hive.parquet.ParquetReaderConfig in project trino by trinodb.

From the class TestHiveFileFormats, method testParquetPageSourceSchemaEvolution.

@Test(dataProvider = "rowCount")
public void testParquetPageSourceSchemaEvolution(int rowCount) throws Exception {
    List<TestColumn> writeColumns = getTestColumnsSupportedByParquet();
    // test index-based access: the read columns are renamed, so they can only be matched to the file schema by position
    List<TestColumn> readColumns = writeColumns.stream()
            .map(column -> new TestColumn(
                    column.getName() + "_new",
                    column.getObjectInspector(),
                    column.getWriteValue(),
                    column.getExpectedValue(),
                    column.isPartitionKey()))
            .collect(toList());
    assertThatFileFormat(PARQUET)
            .withWriteColumns(writeColumns)
            .withReadColumns(readColumns)
            .withSession(PARQUET_SESSION)
            .withRowsCount(rowCount)
            .isReadableByPageSource(new ParquetPageSourceFactory(HDFS_ENVIRONMENT, STATS, new ParquetReaderConfig(), new HiveConfig()));
    // test name-based access: the read columns are reordered, so they can only be matched to the file schema by name
    readColumns = Lists.reverse(writeColumns);
    assertThatFileFormat(PARQUET)
            .withWriteColumns(writeColumns)
            .withReadColumns(readColumns)
            .withSession(PARQUET_SESSION_USE_NAME)
            .isReadableByPageSource(new ParquetPageSourceFactory(HDFS_ENVIRONMENT, STATS, new ParquetReaderConfig(), new HiveConfig()));
}
Also used : OrcFileWriterFactory(io.trino.plugin.hive.orc.OrcFileWriterFactory) ParquetFileWriterFactory(io.trino.plugin.hive.parquet.ParquetFileWriterFactory) Test(org.testng.annotations.Test) NO_ACID_TRANSACTION(io.trino.plugin.hive.acid.AcidTransaction.NO_ACID_TRANSACTION) HiveTestUtils.createGenericHiveRecordCursorProvider(io.trino.plugin.hive.HiveTestUtils.createGenericHiveRecordCursorProvider) TrinoExceptionAssert.assertTrinoExceptionThrownBy(io.trino.testing.assertions.TrinoExceptionAssert.assertTrinoExceptionThrownBy) PARQUET(io.trino.plugin.hive.HiveStorageFormat.PARQUET) FileSplit(org.apache.hadoop.mapred.FileSplit) Locale(java.util.Locale) Configuration(org.apache.hadoop.conf.Configuration) StructuralTestUtil.rowBlockOf(io.trino.testing.StructuralTestUtil.rowBlockOf) Slices.utf8Slice(io.airlift.slice.Slices.utf8Slice) ConnectorPageSource(io.trino.spi.connector.ConnectorPageSource) ObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector) AVRO(io.trino.plugin.hive.HiveStorageFormat.AVRO) SERIALIZATION_LIB(org.apache.hadoop.hive.serde.serdeConstants.SERIALIZATION_LIB) LzoCodec(io.airlift.compress.lzo.LzoCodec) ImmutableSet(com.google.common.collect.ImmutableSet) TimeZone(java.util.TimeZone) MapObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector) BeforeClass(org.testng.annotations.BeforeClass) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) Set(java.util.Set) Assert.assertNotNull(org.testng.Assert.assertNotNull) StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector) Instant(java.time.Instant) Collectors(java.util.stream.Collectors) HDFS_ENVIRONMENT(io.trino.plugin.hive.HiveTestUtils.HDFS_ENVIRONMENT) String.format(java.lang.String.format) Preconditions.checkState(com.google.common.base.Preconditions.checkState) List(java.util.List) ColumnMapping.buildColumnMappings(io.trino.plugin.hive.HivePageSourceProvider.ColumnMapping.buildColumnMappings) OrcReaderConfig(io.trino.plugin.hive.orc.OrcReaderConfig) VarcharTypeInfo(org.apache.hadoop.hive.serde2.typeinfo.VarcharTypeInfo) Optional(java.util.Optional) ParquetReaderConfig(io.trino.plugin.hive.parquet.ParquetReaderConfig) StructField(org.apache.hadoop.hive.serde2.objectinspector.StructField) RcFilePageSourceFactory(io.trino.plugin.hive.rcfile.RcFilePageSourceFactory) ListObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector) DataProvider(org.testng.annotations.DataProvider) PrimitiveCategory(org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory) Type(io.trino.spi.type.Type) Assert.assertEquals(org.testng.Assert.assertEquals) CSV(io.trino.plugin.hive.HiveStorageFormat.CSV) OptionalInt(java.util.OptionalInt) PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector) LzopCodec(io.airlift.compress.lzo.LzopCodec) SymlinkTextInputFormat(org.apache.hadoop.hive.ql.io.SymlinkTextInputFormat) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) ParquetPageSourceFactory(io.trino.plugin.hive.parquet.ParquetPageSourceFactory) HiveVarchar(org.apache.hadoop.hive.common.type.HiveVarchar) ParquetWriterConfig(io.trino.plugin.hive.parquet.ParquetWriterConfig) Lists(com.google.common.collect.Lists) ImmutableList(com.google.common.collect.ImmutableList) 
PrimitiveObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector) SEQUENCEFILE(io.trino.plugin.hive.HiveStorageFormat.SEQUENCEFILE) OrcReaderOptions(io.trino.orc.OrcReaderOptions) OrcPageSourceFactory(io.trino.plugin.hive.orc.OrcPageSourceFactory) RecordPageSource(io.trino.spi.connector.RecordPageSource) Objects.requireNonNull(java.util.Objects.requireNonNull) TEXTFILE(io.trino.plugin.hive.HiveStorageFormat.TEXTFILE) JSON(io.trino.plugin.hive.HiveStorageFormat.JSON) OrcWriterConfig(io.trino.plugin.hive.orc.OrcWriterConfig) RCBINARY(io.trino.plugin.hive.HiveStorageFormat.RCBINARY) RecordCursor(io.trino.spi.connector.RecordCursor) Properties(java.util.Properties) ORC(io.trino.plugin.hive.HiveStorageFormat.ORC) HiveTestUtils.getTypes(io.trino.plugin.hive.HiveTestUtils.getTypes) TESTING_TYPE_MANAGER(io.trino.type.InternalTypeManager.TESTING_TYPE_MANAGER) IOException(java.io.IOException) ConnectorSession(io.trino.spi.connector.ConnectorSession) ObjectInspectorFactory.getStandardStructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory.getStandardStructObjectInspector) TupleDomain(io.trino.spi.predicate.TupleDomain) UTC(org.joda.time.DateTimeZone.UTC) File(java.io.File) TestingConnectorSession(io.trino.testing.TestingConnectorSession) SESSION(io.trino.plugin.hive.HiveTestUtils.SESSION) HiveTestUtils.getHiveSession(io.trino.plugin.hive.HiveTestUtils.getHiveSession) Collectors.toList(java.util.stream.Collectors.toList) OrcWriterOptions(io.trino.orc.OrcWriterOptions) RCTEXT(io.trino.plugin.hive.HiveStorageFormat.RCTEXT) FILE_INPUT_FORMAT(org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.FILE_INPUT_FORMAT) Assert.assertTrue(org.testng.Assert.assertTrue) PrimitiveObjectInspectorFactory.javaStringObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaStringObjectInspector)
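
The page source factory construction repeated across these Parquet tests is identical each time. A minimal helper sketch (hypothetical, not part of TestHiveFileFormats) that reuses only the HDFS_ENVIRONMENT and STATS test constants already visible above:

// Hypothetical convenience helper: the Parquet assertions in these examples all build
// the same page source factory from default configs, so it can be centralized.
private static ParquetPageSourceFactory defaultParquetPageSourceFactory() {
    return new ParquetPageSourceFactory(HDFS_ENVIRONMENT, STATS, new ParquetReaderConfig(), new HiveConfig());
}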

Example 12 with ParquetReaderConfig

Use of io.trino.plugin.hive.parquet.ParquetReaderConfig in project trino by trinodb.

From the class TestHiveFileFormats, method testTruncateVarcharColumn.

@Test
public void testTruncateVarcharColumn() throws Exception {
    // write a varchar(4) value, then read it back with a narrower varchar(3) type: the value must be truncated
    TestColumn writeColumn = new TestColumn("varchar_column", getPrimitiveJavaObjectInspector(new VarcharTypeInfo(4)), new HiveVarchar("test", 4), utf8Slice("test"));
    TestColumn readColumn = new TestColumn("varchar_column", getPrimitiveJavaObjectInspector(new VarcharTypeInfo(3)), new HiveVarchar("tes", 3), utf8Slice("tes"));
    assertThatFileFormat(RCTEXT)
            .withWriteColumns(ImmutableList.of(writeColumn))
            .withReadColumns(ImmutableList.of(readColumn))
            .isReadableByPageSource(new RcFilePageSourceFactory(TESTING_TYPE_MANAGER, HDFS_ENVIRONMENT, STATS, new HiveConfig()))
            .isReadableByRecordCursor(createGenericHiveRecordCursorProvider(HDFS_ENVIRONMENT));
    assertThatFileFormat(RCBINARY)
            .withWriteColumns(ImmutableList.of(writeColumn))
            .withReadColumns(ImmutableList.of(readColumn))
            .isReadableByPageSource(new RcFilePageSourceFactory(TESTING_TYPE_MANAGER, HDFS_ENVIRONMENT, STATS, new HiveConfig()))
            .isReadableByRecordCursor(createGenericHiveRecordCursorProvider(HDFS_ENVIRONMENT));
    assertThatFileFormat(ORC)
            .withWriteColumns(ImmutableList.of(writeColumn))
            .withReadColumns(ImmutableList.of(readColumn))
            .isReadableByPageSource(new OrcPageSourceFactory(new OrcReaderOptions(), HDFS_ENVIRONMENT, STATS, UTC));
    assertThatFileFormat(PARQUET)
            .withWriteColumns(ImmutableList.of(writeColumn))
            .withReadColumns(ImmutableList.of(readColumn))
            .withSession(PARQUET_SESSION)
            .isReadableByPageSource(new ParquetPageSourceFactory(HDFS_ENVIRONMENT, STATS, new ParquetReaderConfig(), new HiveConfig()));
    assertThatFileFormat(AVRO)
            .withWriteColumns(ImmutableList.of(writeColumn))
            .withReadColumns(ImmutableList.of(readColumn))
            .isReadableByRecordCursor(createGenericHiveRecordCursorProvider(HDFS_ENVIRONMENT));
    assertThatFileFormat(SEQUENCEFILE)
            .withWriteColumns(ImmutableList.of(writeColumn))
            .withReadColumns(ImmutableList.of(readColumn))
            .isReadableByRecordCursor(createGenericHiveRecordCursorProvider(HDFS_ENVIRONMENT));
    assertThatFileFormat(TEXTFILE)
            .withWriteColumns(ImmutableList.of(writeColumn))
            .withReadColumns(ImmutableList.of(readColumn))
            .isReadableByRecordCursor(createGenericHiveRecordCursorProvider(HDFS_ENVIRONMENT));
}
Also used : VarcharTypeInfo(org.apache.hadoop.hive.serde2.typeinfo.VarcharTypeInfo) OrcReaderOptions(io.trino.orc.OrcReaderOptions) HiveVarchar(org.apache.hadoop.hive.common.type.HiveVarchar) OrcPageSourceFactory(io.trino.plugin.hive.orc.OrcPageSourceFactory) RcFilePageSourceFactory(io.trino.plugin.hive.rcfile.RcFilePageSourceFactory) ParquetPageSourceFactory(io.trino.plugin.hive.parquet.ParquetPageSourceFactory) ParquetReaderConfig(io.trino.plugin.hive.parquet.ParquetReaderConfig) Test(org.testng.annotations.Test)
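
The expected read value above encodes the truncation directly: a varchar(3) read of the 4-character value "test" yields "tes". A minimal standalone check of that relationship (illustrative, not in the original test), using only the Slice helpers and assertion already imported in this test class:

// Sanity check of the truncation the test relies on: since "test" is ASCII,
// taking its first 3 bytes gives exactly "tes".
Slice written = utf8Slice("test");
Slice expected = utf8Slice("tes");
assertEquals(written.slice(0, 3), expected);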

Example 13 with ParquetReaderConfig

Use of io.trino.plugin.hive.parquet.ParquetReaderConfig in project trino by trinodb.

From the class TestHiveFileFormats, method testParquetPageSourceGzip.

@Test(dataProvider = "validRowAndFileSizePadding")
public void testParquetPageSourceGzip(int rowCount, long fileSizePadding) throws Exception {
    List<TestColumn> testColumns = getTestColumnsSupportedByParquet();
    assertThatFileFormat(PARQUET)
            .withColumns(testColumns)
            .withSession(PARQUET_SESSION)
            .withCompressionCodec(HiveCompressionCodec.GZIP)
            .withFileSizePadding(fileSizePadding)
            .withRowsCount(rowCount)
            .isReadableByPageSource(new ParquetPageSourceFactory(HDFS_ENVIRONMENT, STATS, new ParquetReaderConfig(), new HiveConfig()));
}
Also used : ParquetPageSourceFactory(io.trino.plugin.hive.parquet.ParquetPageSourceFactory) ParquetReaderConfig(io.trino.plugin.hive.parquet.ParquetReaderConfig) Test(org.testng.annotations.Test)
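
GZIP is only one value of HiveCompressionCodec, and the same assertion chain can be looped over several codecs. A hedged sketch of that variant (NONE and SNAPPY are assumed members of the enum; verify against your Trino version):

// Hypothetical variant: exercise the same Parquet read path under multiple compression codecs.
for (HiveCompressionCodec codec : ImmutableList.of(HiveCompressionCodec.NONE, HiveCompressionCodec.SNAPPY, HiveCompressionCodec.GZIP)) {
    assertThatFileFormat(PARQUET)
            .withColumns(testColumns)
            .withSession(PARQUET_SESSION)
            .withCompressionCodec(codec)
            .withRowsCount(rowCount)
            .isReadableByPageSource(new ParquetPageSourceFactory(HDFS_ENVIRONMENT, STATS, new ParquetReaderConfig(), new HiveConfig()));
}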

Example 14 with ParquetReaderConfig

Use of io.trino.plugin.hive.parquet.ParquetReaderConfig in project trino by trinodb.

From the class TestCheckpointWriter, method readCheckpoint.

private CheckpointEntries readCheckpoint(Path checkpointPath, MetadataEntry metadataEntry, boolean rowStatisticsEnabled) throws IOException {
    FileSystem fileSystem = hdfsEnvironment.getFileSystem(new HdfsEnvironment.HdfsContext(session), checkpointPath);
    FileStatus fileStatus = fileSystem.getFileStatus(checkpointPath);
    Iterator<DeltaLakeTransactionLogEntry> checkpointEntryIterator = new CheckpointEntryIterator(
            checkpointPath,
            session,
            fileStatus.getLen(),
            checkpointSchemaManager,
            typeManager,
            ImmutableSet.of(METADATA, PROTOCOL, TRANSACTION, ADD, REMOVE),
            Optional.of(metadataEntry),
            hdfsEnvironment,
            new FileFormatDataSourceStats(),
            new ParquetReaderConfig().toParquetReaderOptions(),
            rowStatisticsEnabled);
    CheckpointBuilder checkpointBuilder = new CheckpointBuilder();
    while (checkpointEntryIterator.hasNext()) {
        DeltaLakeTransactionLogEntry entry = checkpointEntryIterator.next();
        checkpointBuilder.addLogEntry(entry);
    }
    return checkpointBuilder.build();
}
Also used : FileStatus(org.apache.hadoop.fs.FileStatus) DeltaLakeTransactionLogEntry(io.trino.plugin.deltalake.transactionlog.DeltaLakeTransactionLogEntry) FileSystem(org.apache.hadoop.fs.FileSystem) FileFormatDataSourceStats(io.trino.plugin.hive.FileFormatDataSourceStats) HdfsEnvironment(io.trino.plugin.hive.HdfsEnvironment) ParquetReaderConfig(io.trino.plugin.hive.parquet.ParquetReaderConfig)
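
Unlike the Hive examples above, this Delta Lake iterator consumes reader options rather than the config bean itself, so the config is converted with toParquetReaderOptions() first. A minimal illustration of that bridge (the io.trino.parquet.ParquetReaderOptions return type is an assumption inferred from this usage):

// toParquetReaderOptions() turns the config bean into the options object the
// low-level Parquet reader consumes; return type assumed from the call above.
ParquetReaderOptions parquetReaderOptions = new ParquetReaderConfig().toParquetReaderOptions();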

Example 15 with ParquetReaderConfig

Use of io.trino.plugin.hive.parquet.ParquetReaderConfig in project trino by trinodb.

From the class TestTransactionLogAccess, method setupTransactionLogAccess.

private void setupTransactionLogAccess(String tableName, Path tableLocation) throws IOException {
    TestingConnectorContext context = new TestingConnectorContext();
    TypeManager typeManager = context.getTypeManager();
    HdfsConfig hdfsConfig = new HdfsConfig();
    HdfsConfiguration hdfsConfiguration = new HiveHdfsConfiguration(new HdfsConfigurationInitializer(hdfsConfig), ImmutableSet.of());
    HdfsEnvironment hdfsEnvironment = new HdfsEnvironment(hdfsConfiguration, hdfsConfig, new NoHdfsAuthentication());
    FileFormatDataSourceStats fileFormatDataSourceStats = new FileFormatDataSourceStats();
    transactionLogAccess = new TrackingTransactionLogAccess(
            tableName,
            tableLocation,
            SESSION,
            typeManager,
            new CheckpointSchemaManager(typeManager),
            new DeltaLakeConfig(),
            fileFormatDataSourceStats,
            hdfsEnvironment,
            new ParquetReaderConfig());
    DeltaLakeTableHandle tableHandle = new DeltaLakeTableHandle(
            "schema",
            tableName,
            "location",
            // ignored
            Optional.empty(),
            TupleDomain.none(),
            TupleDomain.none(),
            Optional.empty(),
            Optional.empty(),
            Optional.empty(),
            Optional.empty(),
            Optional.empty(),
            0);
    tableSnapshot = transactionLogAccess.loadSnapshot(tableHandle.getSchemaTableName(), tableLocation, SESSION);
}
Also used : HdfsConfigurationInitializer(io.trino.plugin.hive.HdfsConfigurationInitializer) HiveHdfsConfiguration(io.trino.plugin.hive.HiveHdfsConfiguration) HdfsConfig(io.trino.plugin.hive.HdfsConfig) FileFormatDataSourceStats(io.trino.plugin.hive.FileFormatDataSourceStats) TestingConnectorContext(io.trino.testing.TestingConnectorContext) HiveHdfsConfiguration(io.trino.plugin.hive.HiveHdfsConfiguration) HdfsConfiguration(io.trino.plugin.hive.HdfsConfiguration) NoHdfsAuthentication(io.trino.plugin.hive.authentication.NoHdfsAuthentication) HdfsEnvironment(io.trino.plugin.hive.HdfsEnvironment) CheckpointSchemaManager(io.trino.plugin.deltalake.transactionlog.checkpoint.CheckpointSchemaManager) TypeManager(io.trino.spi.type.TypeManager) ParquetReaderConfig(io.trino.plugin.hive.parquet.ParquetReaderConfig)
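
Here a default ParquetReaderConfig is threaded into the transaction log access, which uses it when reading checkpoint Parquet files. A tuned config could be passed instead; a hedged sketch, assuming an Airlift-style fluent setter setMaxReadBlockSize backing the parquet.max-read-block-size property (treat the setter name and chaining as assumptions for your Trino version):

// Hypothetical variant: a non-default reader config for checkpoint reads.
// setMaxReadBlockSize is assumed to be the Airlift config setter (returning this for chaining);
// DataSize is io.airlift.units.DataSize.
ParquetReaderConfig tunedConfig = new ParquetReaderConfig()
        .setMaxReadBlockSize(DataSize.of(8, DataSize.Unit.MEGABYTE));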

Aggregations

ParquetReaderConfig (io.trino.plugin.hive.parquet.ParquetReaderConfig) 15
Test (org.testng.annotations.Test) 10
ParquetPageSourceFactory (io.trino.plugin.hive.parquet.ParquetPageSourceFactory) 7
FileFormatDataSourceStats (io.trino.plugin.hive.FileFormatDataSourceStats) 6
HdfsEnvironment (io.trino.plugin.hive.HdfsEnvironment) 5
OrcReaderOptions (io.trino.orc.OrcReaderOptions) 4
CheckpointSchemaManager (io.trino.plugin.deltalake.transactionlog.checkpoint.CheckpointSchemaManager) 4
HdfsConfig (io.trino.plugin.hive.HdfsConfig) 4
HdfsConfiguration (io.trino.plugin.hive.HdfsConfiguration) 4
HdfsConfigurationInitializer (io.trino.plugin.hive.HdfsConfigurationInitializer) 4
HiveHdfsConfiguration (io.trino.plugin.hive.HiveHdfsConfiguration) 4
NoHdfsAuthentication (io.trino.plugin.hive.authentication.NoHdfsAuthentication) 4
OrcPageSourceFactory (io.trino.plugin.hive.orc.OrcPageSourceFactory) 4
File (java.io.File) 4
ImmutableList (com.google.common.collect.ImmutableList) 3
ImmutableList.toImmutableList (com.google.common.collect.ImmutableList.toImmutableList) 3
ParquetWriterConfig (io.trino.plugin.hive.parquet.ParquetWriterConfig) 3
RcFilePageSourceFactory (io.trino.plugin.hive.rcfile.RcFilePageSourceFactory) 3
ConnectorSession (io.trino.spi.connector.ConnectorSession) 3
TestingConnectorSession (io.trino.testing.TestingConnectorSession) 3