Example 1 with HDFS_ENVIRONMENT

Use of io.trino.plugin.hive.HiveTestUtils.HDFS_ENVIRONMENT in project trino by trinodb.

From the class TestHiveFileFormats, method testRCBinaryProjectedColumns.

@Test(dataProvider = "rowCount")
public void testRCBinaryProjectedColumns(int rowCount) throws Exception {
    // RCBinary does not support complex types as map keys and interprets empty VARCHAR as null
    List<TestColumn> supportedColumns = TEST_COLUMNS.stream().filter(testColumn -> {
        String name = testColumn.getName();
        return !name.equals("t_map_null_key_complex_key_value") && !name.equals("t_empty_varchar");
    }).collect(toList());
    List<TestColumn> regularColumns = getRegularColumns(supportedColumns);
    List<TestColumn> partitionColumns = getPartitionColumns(supportedColumns);
    // Create projected columns for all supported regular columns
    ImmutableList.Builder<TestColumn> writeColumnsBuilder = ImmutableList.builder();
    ImmutableList.Builder<TestColumn> readColumnsBuilder = ImmutableList.builder();
    generateProjectedColumns(regularColumns, writeColumnsBuilder, readColumnsBuilder);
    List<TestColumn> writeColumns = writeColumnsBuilder.addAll(partitionColumns).build();
    List<TestColumn> readColumns = readColumnsBuilder.addAll(partitionColumns).build();
    assertThatFileFormat(RCBINARY)
            .withWriteColumns(writeColumns)
            .withReadColumns(readColumns)
            .withRowsCount(rowCount)
            .withFileWriterFactory(new RcFileFileWriterFactory(HDFS_ENVIRONMENT, TESTING_TYPE_MANAGER, new NodeVersion("test"), HIVE_STORAGE_TIME_ZONE, STATS))
            .isReadableByPageSource(new RcFilePageSourceFactory(TESTING_TYPE_MANAGER, HDFS_ENVIRONMENT, STATS, new HiveConfig()));
}
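
The same fluent harness accepts other formats by swapping in a different page-source factory. As a hedged illustration (not part of this test), the ORC variant would build its factory the same way Example 4 below does:

assertThatFileFormat(ORC)
        .withWriteColumns(writeColumns)
        .withReadColumns(readColumns)
        .withRowsCount(rowCount)
        .isReadableByPageSource(new OrcPageSourceFactory(new OrcReaderOptions(), HDFS_ENVIRONMENT, STATS, UTC));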

Example 2 with HDFS_ENVIRONMENT

Use of io.trino.plugin.hive.HiveTestUtils.HDFS_ENVIRONMENT in project trino by trinodb.

From the class TestBackgroundHiveSplitLoader, method testFullAcidTableWithOriginalFiles.

@Test
public void testFullAcidTableWithOriginalFiles() throws Exception {
    java.nio.file.Path tablePath = Files.createTempDirectory("TestBackgroundHiveSplitLoader");
    Table table = table(tablePath.toString(), ImmutableList.of(), Optional.empty(), ImmutableMap.of("transactional", "true"));
    String originalFile = tablePath + "/000000_1";
    List<String> filePaths = ImmutableList.of(tablePath + "/delta_0000002_0000002_0000/_orc_acid_version", tablePath + "/delta_0000002_0000002_0000/bucket_00000");
    for (String path : filePaths) {
        File file = new File(path);
        assertTrue(file.getParentFile().exists() || file.getParentFile().mkdirs(), "Failed creating directory " + file.getParentFile());
        createOrcAcidFile(file);
    }
    Files.write(Paths.get(originalFile), "test".getBytes(UTF_8));
    // ValidWriteIdsList is of format <currentTxn>$<schema>.<table>:<highWatermark>:<minOpenWriteId>::<AbortedTxns>
    // This writeId list has high watermark transaction=3
    ValidReaderWriteIdList validWriteIdsList = new ValidReaderWriteIdList(format("4$%s.%s:3:9223372036854775807::", table.getDatabaseName(), table.getTableName()));
    BackgroundHiveSplitLoader backgroundHiveSplitLoader = backgroundHiveSplitLoader(HDFS_ENVIRONMENT, TupleDomain.all(), Optional.empty(), table, Optional.empty(), Optional.of(validWriteIdsList));
    HiveSplitSource hiveSplitSource = hiveSplitSource(backgroundHiveSplitLoader);
    backgroundHiveSplitLoader.start(hiveSplitSource);
    List<String> splits = drain(hiveSplitSource);
    assertTrue(splits.stream().anyMatch(p -> p.contains(originalFile)), format("%s not found in splits %s", originalFile, splits));
    assertTrue(splits.stream().anyMatch(p -> p.contains(filePaths.get(1))), format("%s not found in splits %s", filePaths.get(1), splits));
}
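
To make the write-id string concrete, here is an illustrative field-by-field breakdown following the format documented in the comment above (the schema and table names below are placeholders, not values from the test):

ValidReaderWriteIdList ids = new ValidReaderWriteIdList("4$tpch.test:3:9223372036854775807::");
// currentTxn     = 4                    (the reading transaction)
// schema.table   = tpch.test            (placeholder names)
// highWatermark  = 3                    (write ids above 3 are not visible)
// minOpenWriteId = 9223372036854775807  (Long.MAX_VALUE, i.e. no open writes)
// abortedTxns    = (empty)
ids.isWriteIdValid(3); // true: at or below the high watermark, not open or aborted
ids.isWriteIdValid(4); // false: above the high watermark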

Example 3 with HDFS_ENVIRONMENT

Use of io.trino.plugin.hive.HiveTestUtils.HDFS_ENVIRONMENT in project trino by trinodb.

From the class TestHivePageSink, method createPageSource.

private static ConnectorPageSource createPageSource(HiveTransactionHandle transaction, HiveConfig config, File outputFile) {
    Properties splitProperties = new Properties();
    splitProperties.setProperty(FILE_INPUT_FORMAT, config.getHiveStorageFormat().getInputFormat());
    splitProperties.setProperty(SERIALIZATION_LIB, config.getHiveStorageFormat().getSerde());
    splitProperties.setProperty("columns", Joiner.on(',').join(getColumnHandles().stream().map(HiveColumnHandle::getName).collect(toImmutableList())));
    splitProperties.setProperty("columns.types", Joiner.on(',').join(getColumnHandles().stream().map(HiveColumnHandle::getHiveType).map(hiveType -> hiveType.getHiveTypeName().toString()).collect(toImmutableList())));
    HiveSplit split = new HiveSplit(
            SCHEMA_NAME,
            TABLE_NAME,
            "",
            "file:///" + outputFile.getAbsolutePath(),
            0,
            outputFile.length(),
            outputFile.length(),
            outputFile.lastModified(),
            splitProperties,
            ImmutableList.of(),
            ImmutableList.of(),
            OptionalInt.empty(),
            0,
            false,
            TableToPartitionMapping.empty(),
            Optional.empty(),
            Optional.empty(),
            false,
            Optional.empty(),
            0,
            SplitWeight.standard());
    ConnectorTableHandle table = new HiveTableHandle(SCHEMA_NAME, TABLE_NAME, ImmutableMap.of(), ImmutableList.of(), ImmutableList.of(), Optional.empty());
    HivePageSourceProvider provider = new HivePageSourceProvider(
            TESTING_TYPE_MANAGER,
            HDFS_ENVIRONMENT,
            config,
            getDefaultHivePageSourceFactories(HDFS_ENVIRONMENT, config),
            getDefaultHiveRecordCursorProviders(config, HDFS_ENVIRONMENT),
            new GenericHiveRecordCursorProvider(HDFS_ENVIRONMENT, config),
            Optional.empty());
    return provider.createPageSource(transaction, getHiveSession(config), split, table, ImmutableList.copyOf(getColumnHandles()), DynamicFilter.EMPTY);
}
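
A caller typically drains the returned ConnectorPageSource page by page. A minimal consumption loop (a sketch; closing the source and error handling are elided) looks like this:

static long countRows(ConnectorPageSource pageSource) {
    long rows = 0;
    while (!pageSource.isFinished()) {
        Page page = pageSource.getNextPage();
        // getNextPage() may return null while the source is still producing
        if (page != null) {
            rows += page.getPositionCount();
        }
    }
    return rows;
}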

Example 4 with HDFS_ENVIRONMENT

Use of io.trino.plugin.hive.HiveTestUtils.HDFS_ENVIRONMENT in project trino by trinodb.

From the class TestOrcPredicates, method createPageSource.

private ConnectorPageSource createPageSource(TupleDomain<TestColumn> effectivePredicate, List<TestColumn> columnsToRead, ConnectorSession session, FileSplit split) {
    OrcPageSourceFactory readerFactory = new OrcPageSourceFactory(new OrcReaderOptions(), HDFS_ENVIRONMENT, STATS, UTC);
    Properties splitProperties = new Properties();
    splitProperties.setProperty(FILE_INPUT_FORMAT, ORC.getInputFormat());
    splitProperties.setProperty(SERIALIZATION_LIB, ORC.getSerde());
    // Use full columns in split properties
    ImmutableList.Builder<String> splitPropertiesColumnNames = ImmutableList.builder();
    ImmutableList.Builder<String> splitPropertiesColumnTypes = ImmutableList.builder();
    Set<String> baseColumnNames = new HashSet<>();
    for (TestColumn columnToRead : columnsToRead) {
        String name = columnToRead.getBaseName();
        if (!baseColumnNames.contains(name) && !columnToRead.isPartitionKey()) {
            baseColumnNames.add(name);
            splitPropertiesColumnNames.add(name);
            splitPropertiesColumnTypes.add(columnToRead.getBaseObjectInspector().getTypeName());
        }
    }
    splitProperties.setProperty("columns", splitPropertiesColumnNames.build().stream().collect(Collectors.joining(",")));
    splitProperties.setProperty("columns.types", splitPropertiesColumnTypes.build().stream().collect(Collectors.joining(",")));
    List<HivePartitionKey> partitionKeys = columnsToRead.stream().filter(TestColumn::isPartitionKey).map(input -> new HivePartitionKey(input.getName(), (String) input.getWriteValue())).collect(toList());
    String partitionName = String.join("/", partitionKeys.stream().map(partitionKey -> format("%s=%s", partitionKey.getName(), partitionKey.getValue())).collect(toImmutableList()));
    List<HiveColumnHandle> columnHandles = getColumnHandles(columnsToRead);
    TupleDomain<HiveColumnHandle> predicate = effectivePredicate.transformKeys(testColumn -> {
        Optional<HiveColumnHandle> handle = columnHandles.stream().filter(column -> testColumn.getName().equals(column.getName())).findFirst();
        checkState(handle.isPresent(), "Predicate on invalid column");
        return handle.get();
    });
    List<HivePageSourceProvider.ColumnMapping> columnMappings = buildColumnMappings(
            partitionName,
            partitionKeys,
            columnHandles,
            ImmutableList.of(),
            TableToPartitionMapping.empty(),
            split.getPath(),
            OptionalInt.empty(),
            split.getLength(),
            Instant.now().toEpochMilli());
    Optional<ConnectorPageSource> pageSource = HivePageSourceProvider.createHivePageSource(
            ImmutableSet.of(readerFactory),
            ImmutableSet.of(),
            new Configuration(false),
            session,
            split.getPath(),
            OptionalInt.empty(),
            split.getStart(),
            split.getLength(),
            split.getLength(),
            splitProperties,
            predicate,
            columnHandles,
            TESTING_TYPE_MANAGER,
            Optional.empty(),
            Optional.empty(),
            false,
            Optional.empty(),
            false,
            NO_ACID_TRANSACTION,
            columnMappings);
    assertTrue(pageSource.isPresent());
    return pageSource.get();
}
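
For reference, the effectivePredicate passed into this helper is an ordinary TupleDomain. A hedged sketch of how one is constructed (keyed by a placeholder string here; the test keys it by TestColumn instances):

TupleDomain<String> predicate = TupleDomain.withColumnDomains(
        ImmutableMap.of("t_bigint", Domain.singleValue(BIGINT, 42L)));
// Row groups where t_bigint cannot equal 42 can then be pruned by the ORC reader.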

Example 5 with HDFS_ENVIRONMENT

Use of io.trino.plugin.hive.HiveTestUtils.HDFS_ENVIRONMENT in project trino by trinodb.

From the class TestHiveFileFormats, method testRCBinaryProjectedColumnsPageSource.

@Test(dataProvider = "rowCount")
public void testRCBinaryProjectedColumnsPageSource(int rowCount) throws Exception {
    // RCBinary interprets empty VARCHAR as null, so only t_empty_varchar is excluded here
    List<TestColumn> supportedColumns = TEST_COLUMNS.stream().filter(testColumn -> !testColumn.getName().equals("t_empty_varchar")).collect(toList());
    List<TestColumn> regularColumns = getRegularColumns(supportedColumns);
    List<TestColumn> partitionColumns = getPartitionColumns(supportedColumns);
    // Create projected columns for all supported regular columns
    ImmutableList.Builder<TestColumn> writeColumnsBuilder = ImmutableList.builder();
    ImmutableList.Builder<TestColumn> readColumnsBuilder = ImmutableList.builder();
    generateProjectedColumns(regularColumns, writeColumnsBuilder, readColumnsBuilder);
    List<TestColumn> writeColumns = writeColumnsBuilder.addAll(partitionColumns).build();
    List<TestColumn> readColumns = readColumnsBuilder.addAll(partitionColumns).build();
    assertThatFileFormat(RCBINARY)
            .withWriteColumns(writeColumns)
            .withReadColumns(readColumns)
            .withRowsCount(rowCount)
            .withFileWriterFactory(new RcFileFileWriterFactory(HDFS_ENVIRONMENT, TESTING_TYPE_MANAGER, new NodeVersion("test"), HIVE_STORAGE_TIME_ZONE, STATS))
            .isReadableByPageSource(new RcFilePageSourceFactory(TESTING_TYPE_MANAGER, HDFS_ENVIRONMENT, STATS, new HiveConfig()));
}
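
Both TestHiveFileFormats tests above are driven by a TestNG data provider named rowCount. A minimal sketch of such a provider (the specific counts are an assumption, not copied from the class):

@DataProvider(name = "rowCount")
public static Object[][] rowCount() {
    // Exercise the empty, single-row, and multi-page cases (illustrative values)
    return new Object[][] {{0}, {1}, {1000}};
}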

Aggregations

Symbols shared across the examples (the trailing number is how many examples use each symbol):

ImmutableList (com.google.common.collect.ImmutableList): 8
ImmutableList.toImmutableList (com.google.common.collect.ImmutableList.toImmutableList): 8
HDFS_ENVIRONMENT (io.trino.plugin.hive.HiveTestUtils.HDFS_ENVIRONMENT): 8
HiveTestUtils.getHiveSession (io.trino.plugin.hive.HiveTestUtils.getHiveSession): 8
NO_ACID_TRANSACTION (io.trino.plugin.hive.acid.AcidTransaction.NO_ACID_TRANSACTION): 8
ConnectorSession (io.trino.spi.connector.ConnectorSession): 8
TESTING_TYPE_MANAGER (io.trino.type.InternalTypeManager.TESTING_TYPE_MANAGER): 8
File (java.io.File): 8
String.format (java.lang.String.format): 8
List (java.util.List): 8
Optional (java.util.Optional): 8
OptionalInt (java.util.OptionalInt): 8
Properties (java.util.Properties): 8
FILE_INPUT_FORMAT (org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.FILE_INPUT_FORMAT): 8
SERIALIZATION_LIB (org.apache.hadoop.hive.serde.serdeConstants.SERIALIZATION_LIB): 8
Assert.assertTrue (org.testng.Assert.assertTrue): 8
Test (org.testng.annotations.Test): 8
ImmutableSet (com.google.common.collect.ImmutableSet): 7
TupleDomain (io.trino.spi.predicate.TupleDomain): 7
ArrayList (java.util.ArrayList): 7