Search in sources:

Example 1 with ParquetPageSourceFactory

use of io.prestosql.plugin.hive.parquet.ParquetPageSourceFactory in project hetu-core by openlookeng.

the class TestHiveFileFormats method testTruncateVarcharColumn.

/**
 * Writes a single {@code varchar(4)} column holding {@code "test"} and reads it back through a
 * {@code varchar(3)} column, expecting the truncated value {@code "tes"} for every storage
 * format exercised below.
 */
@Test
public void testTruncateVarcharColumn() throws Exception {
    TestColumn writeColumn = new TestColumn(
            "varchar_column",
            getPrimitiveJavaObjectInspector(new VarcharTypeInfo(4)),
            new HiveVarchar("test", 4),
            utf8Slice("test"));
    TestColumn readColumn = new TestColumn(
            "varchar_column",
            getPrimitiveJavaObjectInspector(new VarcharTypeInfo(3)),
            new HiveVarchar("tes", 3),
            utf8Slice("tes"));
    List<TestColumn> written = ImmutableList.of(writeColumn);
    List<TestColumn> expected = ImmutableList.of(readColumn);

    // Source of the default ORC cache limits and TTLs; only read from, never modified.
    HiveConfig cacheDefaults = new HiveConfig();

    // RCFile variants are checked through both the page source and the record cursor.
    assertThatFileFormat(RCTEXT)
            .withWriteColumns(written)
            .withReadColumns(expected)
            .isReadableByPageSource(new RcFilePageSourceFactory(TYPE_MANAGER, HDFS_ENVIRONMENT, STATS, new HiveConfig()))
            .isReadableByRecordCursor(new GenericHiveRecordCursorProvider(HDFS_ENVIRONMENT));
    assertThatFileFormat(RCBINARY)
            .withWriteColumns(written)
            .withReadColumns(expected)
            .isReadableByPageSource(new RcFilePageSourceFactory(TYPE_MANAGER, HDFS_ENVIRONMENT, STATS, new HiveConfig()))
            .isReadableByRecordCursor(new GenericHiveRecordCursorProvider(HDFS_ENVIRONMENT));

    // ORC and Parquet are checked through their page sources only.
    assertThatFileFormat(ORC)
            .withWriteColumns(written)
            .withReadColumns(expected)
            .isReadableByPageSource(new OrcPageSourceFactory(
                    TYPE_MANAGER,
                    new HiveConfig().setUseOrcColumnNames(false),
                    HDFS_ENVIRONMENT,
                    STATS,
                    OrcCacheStore.builder().newCacheStore(
                            cacheDefaults.getOrcFileTailCacheLimit(),
                            Duration.ofMillis(cacheDefaults.getOrcFileTailCacheTtl().toMillis()),
                            cacheDefaults.getOrcStripeFooterCacheLimit(),
                            Duration.ofMillis(cacheDefaults.getOrcStripeFooterCacheTtl().toMillis()),
                            cacheDefaults.getOrcRowIndexCacheLimit(),
                            Duration.ofMillis(cacheDefaults.getOrcRowIndexCacheTtl().toMillis()),
                            cacheDefaults.getOrcBloomFiltersCacheLimit(),
                            Duration.ofMillis(cacheDefaults.getOrcBloomFiltersCacheTtl().toMillis()),
                            cacheDefaults.getOrcRowDataCacheMaximumWeight(),
                            Duration.ofMillis(cacheDefaults.getOrcRowDataCacheTtl().toMillis()),
                            cacheDefaults.isOrcCacheStatsMetricCollectionEnabled())));
    assertThatFileFormat(PARQUET)
            .withWriteColumns(written)
            .withReadColumns(expected)
            .withSession(parquetPageSourceSession)
            .isReadableByPageSource(new ParquetPageSourceFactory(TYPE_MANAGER, HDFS_ENVIRONMENT, STATS, new HiveConfig()));

    // The remaining formats are checked through the generic record cursor only.
    assertThatFileFormat(AVRO)
            .withWriteColumns(written)
            .withReadColumns(expected)
            .isReadableByRecordCursor(new GenericHiveRecordCursorProvider(HDFS_ENVIRONMENT));
    assertThatFileFormat(SEQUENCEFILE)
            .withWriteColumns(written)
            .withReadColumns(expected)
            .isReadableByRecordCursor(new GenericHiveRecordCursorProvider(HDFS_ENVIRONMENT));
    assertThatFileFormat(TEXTFILE)
            .withWriteColumns(written)
            .withReadColumns(expected)
            .isReadableByRecordCursor(new GenericHiveRecordCursorProvider(HDFS_ENVIRONMENT));
}
Also used : VarcharTypeInfo(org.apache.hadoop.hive.serde2.typeinfo.VarcharTypeInfo) HiveTestUtils.createGenericHiveRecordCursorProvider(io.prestosql.plugin.hive.HiveTestUtils.createGenericHiveRecordCursorProvider) HiveVarchar(org.apache.hadoop.hive.common.type.HiveVarchar) OrcPageSourceFactory(io.prestosql.plugin.hive.orc.OrcPageSourceFactory) RcFilePageSourceFactory(io.prestosql.plugin.hive.rcfile.RcFilePageSourceFactory) ParquetPageSourceFactory(io.prestosql.plugin.hive.parquet.ParquetPageSourceFactory) Test(org.testng.annotations.Test)

Example 2 with ParquetPageSourceFactory

use of io.prestosql.plugin.hive.parquet.ParquetPageSourceFactory in project hetu-core by openlookeng.

the class TestHiveFileFormats method testFailForLongVarcharPartitionColumn.

/**
 * Declares a {@code varchar(3)} partition column whose value is the four-character string
 * {@code "test"} and expects every reader exercised below to fail with
 * {@code HIVE_INVALID_PARTITION_VALUE} and the matching message.
 */
@Test
public void testFailForLongVarcharPartitionColumn() throws Exception {
    TestColumn partitionColumn = new TestColumn(
            "partition_column",
            getPrimitiveJavaObjectInspector(new VarcharTypeInfo(3)),
            "test",
            utf8Slice("tes"),
            true);
    TestColumn varcharColumn = new TestColumn(
            "varchar_column",
            getPrimitiveJavaObjectInspector(new VarcharTypeInfo(3)),
            new HiveVarchar("tes", 3),
            utf8Slice("tes"));
    List<TestColumn> columns = ImmutableList.of(partitionColumn, varcharColumn);

    HiveErrorCode expectedErrorCode = HiveErrorCode.HIVE_INVALID_PARTITION_VALUE;
    String expectedMessage = "Invalid partition value 'test' for varchar(3) partition key: partition_column";

    // Source of the default ORC cache limits and TTLs; only read from, never modified.
    HiveConfig cacheDefaults = new HiveConfig();

    // RCFile variants must fail through both the page source and the record cursor.
    assertThatFileFormat(RCTEXT)
            .withColumns(columns)
            .isFailingForPageSource(
                    new RcFilePageSourceFactory(TYPE_MANAGER, HDFS_ENVIRONMENT, STATS, new HiveConfig()),
                    expectedErrorCode,
                    expectedMessage)
            .isFailingForRecordCursor(new GenericHiveRecordCursorProvider(HDFS_ENVIRONMENT), expectedErrorCode, expectedMessage);
    assertThatFileFormat(RCBINARY)
            .withColumns(columns)
            .isFailingForPageSource(
                    new RcFilePageSourceFactory(TYPE_MANAGER, HDFS_ENVIRONMENT, STATS, new HiveConfig()),
                    expectedErrorCode,
                    expectedMessage)
            .isFailingForRecordCursor(new GenericHiveRecordCursorProvider(HDFS_ENVIRONMENT), expectedErrorCode, expectedMessage);

    // ORC and Parquet are checked through their page sources only.
    assertThatFileFormat(ORC)
            .withColumns(columns)
            .isFailingForPageSource(
                    new OrcPageSourceFactory(
                            TYPE_MANAGER,
                            new HiveConfig().setUseOrcColumnNames(false),
                            HDFS_ENVIRONMENT,
                            STATS,
                            OrcCacheStore.builder().newCacheStore(
                                    cacheDefaults.getOrcFileTailCacheLimit(),
                                    Duration.ofMillis(cacheDefaults.getOrcFileTailCacheTtl().toMillis()),
                                    cacheDefaults.getOrcStripeFooterCacheLimit(),
                                    Duration.ofMillis(cacheDefaults.getOrcStripeFooterCacheTtl().toMillis()),
                                    cacheDefaults.getOrcRowIndexCacheLimit(),
                                    Duration.ofMillis(cacheDefaults.getOrcRowIndexCacheTtl().toMillis()),
                                    cacheDefaults.getOrcBloomFiltersCacheLimit(),
                                    Duration.ofMillis(cacheDefaults.getOrcBloomFiltersCacheTtl().toMillis()),
                                    cacheDefaults.getOrcRowDataCacheMaximumWeight(),
                                    Duration.ofMillis(cacheDefaults.getOrcRowDataCacheTtl().toMillis()),
                                    cacheDefaults.isOrcCacheStatsMetricCollectionEnabled())),
                    expectedErrorCode,
                    expectedMessage);
    assertThatFileFormat(PARQUET)
            .withColumns(columns)
            .withSession(parquetPageSourceSession)
            .isFailingForPageSource(
                    new ParquetPageSourceFactory(TYPE_MANAGER, HDFS_ENVIRONMENT, STATS, new HiveConfig()),
                    expectedErrorCode,
                    expectedMessage);

    // The remaining formats are checked through the generic record cursor only.
    assertThatFileFormat(SEQUENCEFILE)
            .withColumns(columns)
            .isFailingForRecordCursor(new GenericHiveRecordCursorProvider(HDFS_ENVIRONMENT), expectedErrorCode, expectedMessage);
    assertThatFileFormat(TEXTFILE)
            .withColumns(columns)
            .isFailingForRecordCursor(new GenericHiveRecordCursorProvider(HDFS_ENVIRONMENT), expectedErrorCode, expectedMessage);
}
Also used : VarcharTypeInfo(org.apache.hadoop.hive.serde2.typeinfo.VarcharTypeInfo) HiveTestUtils.createGenericHiveRecordCursorProvider(io.prestosql.plugin.hive.HiveTestUtils.createGenericHiveRecordCursorProvider) HiveVarchar(org.apache.hadoop.hive.common.type.HiveVarchar) OrcPageSourceFactory(io.prestosql.plugin.hive.orc.OrcPageSourceFactory) RcFilePageSourceFactory(io.prestosql.plugin.hive.rcfile.RcFilePageSourceFactory) ParquetPageSourceFactory(io.prestosql.plugin.hive.parquet.ParquetPageSourceFactory) Test(org.testng.annotations.Test)

Example 3 with ParquetPageSourceFactory

use of io.prestosql.plugin.hive.parquet.ParquetPageSourceFactory in project boostkit-bigdata by kunpengcompute.

the class HiveTestUtils method getDefaultHiveDataStreamFactories.

/**
 * Builds the default set of page source factories (RCFile, ORC, Parquet) for tests.
 *
 * <p>All three factories are created from the supplied {@code hiveConfig}. Previously the
 * Parquet factory was handed a fresh {@code new HiveConfig()}, so caller-supplied settings
 * were silently ignored for Parquet while being honoured for RCFile and ORC.
 *
 * @param hiveConfig configuration used for the test HDFS environment and for every factory
 * @return an immutable set holding the RCFile, ORC and Parquet page source factories, in that order
 */
public static Set<HivePageSourceFactory> getDefaultHiveDataStreamFactories(HiveConfig hiveConfig) {
    FileFormatDataSourceStats stats = new FileFormatDataSourceStats();
    HdfsEnvironment testHdfsEnvironment = createTestHdfsEnvironment(hiveConfig);

    // The ORC cache store is sized from a default HiveConfig, exactly as before.
    // NOTE(review): it is unclear whether the cache limits should follow hiveConfig instead - confirm.
    HiveConfig cacheDefaults = new HiveConfig();

    return ImmutableSet.<HivePageSourceFactory>builder()
            .add(new RcFilePageSourceFactory(TYPE_MANAGER, testHdfsEnvironment, stats, hiveConfig))
            .add(new OrcPageSourceFactory(
                    TYPE_MANAGER,
                    hiveConfig,
                    testHdfsEnvironment,
                    stats,
                    OrcCacheStore.builder().newCacheStore(
                            cacheDefaults.getOrcFileTailCacheLimit(),
                            Duration.ofMillis(cacheDefaults.getOrcFileTailCacheTtl().toMillis()),
                            cacheDefaults.getOrcStripeFooterCacheLimit(),
                            Duration.ofMillis(cacheDefaults.getOrcStripeFooterCacheTtl().toMillis()),
                            cacheDefaults.getOrcRowIndexCacheLimit(),
                            Duration.ofMillis(cacheDefaults.getOrcRowIndexCacheTtl().toMillis()),
                            cacheDefaults.getOrcBloomFiltersCacheLimit(),
                            Duration.ofMillis(cacheDefaults.getOrcBloomFiltersCacheTtl().toMillis()),
                            cacheDefaults.getOrcRowDataCacheMaximumWeight(),
                            Duration.ofMillis(cacheDefaults.getOrcRowDataCacheTtl().toMillis()),
                            cacheDefaults.isOrcCacheStatsMetricCollectionEnabled())))
            // Use the caller's configuration, consistent with the RCFile and ORC factories above.
            .add(new ParquetPageSourceFactory(TYPE_MANAGER, testHdfsEnvironment, stats, hiveConfig))
            .build();
}
Also used : OrcPageSourceFactory(io.prestosql.plugin.hive.orc.OrcPageSourceFactory) RcFilePageSourceFactory(io.prestosql.plugin.hive.rcfile.RcFilePageSourceFactory) ParquetPageSourceFactory(io.prestosql.plugin.hive.parquet.ParquetPageSourceFactory)

Example 4 with ParquetPageSourceFactory

use of io.prestosql.plugin.hive.parquet.ParquetPageSourceFactory in project boostkit-bigdata by kunpengcompute.

the class TestHiveFileFormats method testParquetPageSourceSchemaEvolution.

/**
 * Checks that Parquet data stays readable when the read schema differs from the written one:
 * first with every column renamed (read under {@code parquetPageSourceSession}), then with the
 * column order reversed (read under {@code parquetPageSourceSessionUseName}).
 *
 * @param rowCount value supplied by the {@code rowCount} data provider and passed to
 *         {@code withRowsCount} for the first check
 */
@Test(dataProvider = "rowCount")
public void testParquetPageSourceSchemaEvolution(int rowCount) throws Exception {
    List<TestColumn> writeColumns = getTestColumnsSupportedByParquet();

    // test index-based access: same columns, each with a "_new" suffix on its name
    List<TestColumn> renamedColumns = writeColumns.stream()
            .map(original -> new TestColumn(
                    original.getName() + "_new",
                    original.getObjectInspector(),
                    original.getWriteValue(),
                    original.getExpectedValue(),
                    original.isPartitionKey()))
            .collect(toList());
    assertThatFileFormat(PARQUET)
            .withWriteColumns(writeColumns)
            .withReadColumns(renamedColumns)
            .withSession(parquetPageSourceSession)
            .withRowsCount(rowCount)
            .isReadableByPageSource(new ParquetPageSourceFactory(HiveTestUtils.TYPE_MANAGER, HiveTestUtils.HDFS_ENVIRONMENT, STATS, new HiveConfig()));

    // test name-based access: original names, reversed column order
    // NOTE(review): rowCount is not applied to this second check - confirm that is intended.
    List<TestColumn> reversedColumns = Lists.reverse(writeColumns);
    assertThatFileFormat(PARQUET)
            .withWriteColumns(writeColumns)
            .withReadColumns(reversedColumns)
            .withSession(parquetPageSourceSessionUseName)
            .isReadableByPageSource(new ParquetPageSourceFactory(HiveTestUtils.TYPE_MANAGER, HiveTestUtils.HDFS_ENVIRONMENT, STATS, new HiveConfig()));
}
Also used : Iterables.transform(com.google.common.collect.Iterables.transform) RCBINARY(io.prestosql.plugin.hive.HiveStorageFormat.RCBINARY) Test(org.testng.annotations.Test) TEXTFILE(io.prestosql.plugin.hive.HiveStorageFormat.TEXTFILE) FileSplit(org.apache.hadoop.mapred.FileSplit) ConnectorSession(io.prestosql.spi.connector.ConnectorSession) Predicates.not(com.google.common.base.Predicates.not) Locale(java.util.Locale) Slices(io.airlift.slice.Slices) Configuration(org.apache.hadoop.conf.Configuration) Duration(java.time.Duration) Slices.utf8Slice(io.airlift.slice.Slices.utf8Slice) ObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector) JSON(io.prestosql.plugin.hive.HiveStorageFormat.JSON) SERIALIZATION_LIB(org.apache.hadoop.hive.serde.serdeConstants.SERIALIZATION_LIB) LzoCodec(io.airlift.compress.lzo.LzoCodec) PrestoException(io.prestosql.spi.PrestoException) ImmutableSet(com.google.common.collect.ImmutableSet) ImmutableMap(com.google.common.collect.ImmutableMap) TimeZone(java.util.TimeZone) MapObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector) BeforeClass(org.testng.annotations.BeforeClass) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) Assert.assertNotNull(org.testng.Assert.assertNotNull) StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector) RCTEXT(io.prestosql.plugin.hive.HiveStorageFormat.RCTEXT) List(java.util.List) ConnectorPageSource(io.prestosql.spi.connector.ConnectorPageSource) VarcharTypeInfo(org.apache.hadoop.hive.serde2.typeinfo.VarcharTypeInfo) ParquetPageSourceFactory(io.prestosql.plugin.hive.parquet.ParquetPageSourceFactory) HDFS_ENVIRONMENT(io.prestosql.plugin.hive.HiveTestUtils.HDFS_ENVIRONMENT) Optional(java.util.Optional) Iterables.filter(com.google.common.collect.Iterables.filter) ORC(io.prestosql.plugin.hive.HiveStorageFormat.ORC) Joiner(com.google.common.base.Joiner) 
StructField(org.apache.hadoop.hive.serde2.objectinspector.StructField) ListObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector) DataProvider(org.testng.annotations.DataProvider) Logger(io.airlift.log.Logger) PrimitiveCategory(org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory) TYPE_MANAGER(io.prestosql.plugin.hive.HiveTestUtils.TYPE_MANAGER) Assert.assertEquals(org.testng.Assert.assertEquals) RcFilePageSourceFactory(io.prestosql.plugin.hive.rcfile.RcFilePageSourceFactory) OptionalInt(java.util.OptionalInt) PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector) LzopCodec(io.airlift.compress.lzo.LzopCodec) HiveVarchar(org.apache.hadoop.hive.common.type.HiveVarchar) Lists(com.google.common.collect.Lists) ImmutableList(com.google.common.collect.ImmutableList) PrimitiveObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector) Objects.requireNonNull(java.util.Objects.requireNonNull) OrcPageSourceFactory(io.prestosql.plugin.hive.orc.OrcPageSourceFactory) AVRO(io.prestosql.plugin.hive.HiveStorageFormat.AVRO) SEQUENCEFILE(io.prestosql.plugin.hive.HiveStorageFormat.SEQUENCEFILE) RecordCursor(io.prestosql.spi.connector.RecordCursor) PARQUET(io.prestosql.plugin.hive.HiveStorageFormat.PARQUET) Properties(java.util.Properties) TupleDomain(io.prestosql.spi.predicate.TupleDomain) CSV(io.prestosql.plugin.hive.HiveStorageFormat.CSV) Assert.fail(org.testng.Assert.fail) IOException(java.io.IOException) File(java.io.File) OrcWriterOptions(io.prestosql.orc.OrcWriterOptions) HiveTestUtils.createGenericHiveRecordCursorProvider(io.prestosql.plugin.hive.HiveTestUtils.createGenericHiveRecordCursorProvider) Collectors.toList(java.util.stream.Collectors.toList) OrcCacheStore(io.prestosql.orc.OrcCacheStore) 
FILE_INPUT_FORMAT(org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.FILE_INPUT_FORMAT) Assert.assertTrue(org.testng.Assert.assertTrue) TestingConnectorSession(io.prestosql.testing.TestingConnectorSession) PrimitiveObjectInspectorFactory.javaStringObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaStringObjectInspector) RecordPageSource(io.prestosql.spi.connector.RecordPageSource) ParquetPageSourceFactory(io.prestosql.plugin.hive.parquet.ParquetPageSourceFactory) Test(org.testng.annotations.Test)

Example 5 with ParquetPageSourceFactory

use of io.prestosql.plugin.hive.parquet.ParquetPageSourceFactory in project hetu-core by openlookeng.

the class HiveTestUtils method getDefaultHiveDataStreamFactories.

/**
 * Builds the default set of page source factories (RCFile, ORC, Parquet) for tests.
 *
 * <p>Each factory receives the supplied {@code hiveConfig}; the ORC cache store is sized from a
 * default-constructed {@code HiveConfig}.
 *
 * @param hiveConfig configuration used for the test HDFS environment and for every factory
 * @return an immutable set holding the RCFile, ORC and Parquet page source factories, in that order
 */
public static Set<HivePageSourceFactory> getDefaultHiveDataStreamFactories(HiveConfig hiveConfig) {
    FileFormatDataSourceStats stats = new FileFormatDataSourceStats();
    HdfsEnvironment testHdfsEnvironment = createTestHdfsEnvironment(hiveConfig);

    HivePageSourceFactory rcFileFactory = new RcFilePageSourceFactory(TYPE_MANAGER, testHdfsEnvironment, stats, hiveConfig);

    // Default cache limits and TTLs; this config is only read from.
    HiveConfig cacheDefaults = new HiveConfig();
    HivePageSourceFactory orcFactory = new OrcPageSourceFactory(
            TYPE_MANAGER,
            hiveConfig,
            testHdfsEnvironment,
            stats,
            OrcCacheStore.builder().newCacheStore(
                    cacheDefaults.getOrcFileTailCacheLimit(),
                    Duration.ofMillis(cacheDefaults.getOrcFileTailCacheTtl().toMillis()),
                    cacheDefaults.getOrcStripeFooterCacheLimit(),
                    Duration.ofMillis(cacheDefaults.getOrcStripeFooterCacheTtl().toMillis()),
                    cacheDefaults.getOrcRowIndexCacheLimit(),
                    Duration.ofMillis(cacheDefaults.getOrcRowIndexCacheTtl().toMillis()),
                    cacheDefaults.getOrcBloomFiltersCacheLimit(),
                    Duration.ofMillis(cacheDefaults.getOrcBloomFiltersCacheTtl().toMillis()),
                    cacheDefaults.getOrcRowDataCacheMaximumWeight(),
                    Duration.ofMillis(cacheDefaults.getOrcRowDataCacheTtl().toMillis()),
                    cacheDefaults.isOrcCacheStatsMetricCollectionEnabled()));

    HivePageSourceFactory parquetFactory = new ParquetPageSourceFactory(TYPE_MANAGER, testHdfsEnvironment, stats, hiveConfig);

    return ImmutableSet.of(rcFileFactory, orcFactory, parquetFactory);
}
Also used : OrcPageSourceFactory(io.prestosql.plugin.hive.orc.OrcPageSourceFactory) RcFilePageSourceFactory(io.prestosql.plugin.hive.rcfile.RcFilePageSourceFactory) ParquetPageSourceFactory(io.prestosql.plugin.hive.parquet.ParquetPageSourceFactory)

Aggregations

ParquetPageSourceFactory (io.prestosql.plugin.hive.parquet.ParquetPageSourceFactory)10 OrcPageSourceFactory (io.prestosql.plugin.hive.orc.OrcPageSourceFactory)8 RcFilePageSourceFactory (io.prestosql.plugin.hive.rcfile.RcFilePageSourceFactory)8 Test (org.testng.annotations.Test)8 HiveTestUtils.createGenericHiveRecordCursorProvider (io.prestosql.plugin.hive.HiveTestUtils.createGenericHiveRecordCursorProvider)6 HiveVarchar (org.apache.hadoop.hive.common.type.HiveVarchar)6 VarcharTypeInfo (org.apache.hadoop.hive.serde2.typeinfo.VarcharTypeInfo)6 Joiner (com.google.common.base.Joiner)2 Predicates.not (com.google.common.base.Predicates.not)2 ImmutableList (com.google.common.collect.ImmutableList)2 ImmutableList.toImmutableList (com.google.common.collect.ImmutableList.toImmutableList)2 ImmutableMap (com.google.common.collect.ImmutableMap)2 ImmutableSet (com.google.common.collect.ImmutableSet)2 Iterables.filter (com.google.common.collect.Iterables.filter)2 Iterables.transform (com.google.common.collect.Iterables.transform)2 Lists (com.google.common.collect.Lists)2 LzoCodec (io.airlift.compress.lzo.LzoCodec)2 LzopCodec (io.airlift.compress.lzo.LzopCodec)2 Logger (io.airlift.log.Logger)2 Slices (io.airlift.slice.Slices)2