Use of com.facebook.presto.hive.metastore.Storage in the presto project by prestodb.
Class TestHivePageSourceProvider, method testUseRecordReaderWithInputFormatAnnotationAndCustomSplit.
/**
 * Builds a page source for a Parquet/Hudi storage descriptor whose split carries custom split
 * info, with {@code useRecordPageSourceForCustomSplit} switched on in the Hive client config,
 * and asserts that the page source returned is a {@link RecordPageSource}.
 */
@Test
public void testUseRecordReaderWithInputFormatAnnotationAndCustomSplit() {
    // Storage descriptor: Parquet serde read through the Hudi input format, no output format.
    StorageFormat hudiParquetFormat = StorageFormat.create(
            ParquetHiveSerDe.class.getName(),
            HoodieParquetInputFormat.class.getName(),
            "");
    Storage storage = new Storage(hudiParquetFormat, "test", Optional.empty(), true, ImmutableMap.of(), ImmutableMap.of());

    // Custom split description for a Hudi realtime file split.
    Map<String, String> customSplitInfo = ImmutableMap.<String, String>builder()
            .put(CUSTOM_FILE_SPLIT_CLASS_KEY, HoodieRealtimeFileSplit.class.getName())
            .put(HUDI_BASEPATH_KEY, "/test/file.parquet")
            .put(HUDI_DELTA_FILEPATHS_KEY, "/test/.file_100.log")
            .put(HUDI_MAX_COMMIT_TIME_KEY, "100")
            .build();

    HiveRecordCursorProvider cursorProvider = new MockHiveRecordCursorProvider();
    HiveBatchPageSourceFactory batchPageSourceFactory = new MockHiveBatchPageSourceFactory();

    HiveClientConfig clientConfig = new HiveClientConfig().setUseRecordPageSourceForCustomSplit(true);
    TestingConnectorSession session = new TestingConnectorSession(
            new HiveSessionProperties(clientConfig, new OrcFileWriterConfig(), new ParquetFileWriterConfig(), new CacheConfig()).getSessionProperties());
    long fileModifiedTime = Instant.now().toEpochMilli();

    Optional<ConnectorPageSource> pageSource = HivePageSourceProvider.createHivePageSource(
            ImmutableSet.of(cursorProvider),
            ImmutableSet.of(batchPageSourceFactory),
            new Configuration(),
            session,
            new Path("/test/"),
            OptionalInt.empty(),
            0,
            100,
            200,
            fileModifiedTime,
            storage,
            TupleDomain.none(),
            ImmutableList.of(),
            ImmutableMap.of(),
            ImmutableList.of(),
            DateTimeZone.UTC,
            new TestingTypeManager(),
            new SchemaTableName("test", "test"),
            ImmutableList.of(),
            ImmutableList.of(),
            ImmutableMap.of(),
            0,
            TableToPartitionMapping.empty(),
            Optional.empty(),
            false,
            null,
            null,
            false,
            null,
            Optional.empty(),
            customSplitInfo);

    assertTrue(pageSource.isPresent());
    assertTrue(pageSource.get() instanceof RecordPageSource);
}
Use of com.facebook.presto.hive.metastore.Storage in the presto project by prestodb.
Class TestHiveSplitManager, method assertRedundantColumnDomains.
/**
 * Plans splits for a one-partition ORC test table with partition-statistics-based optimization
 * enabled, then asserts that the redundant column domains reported by each produced split match
 * the expectation.
 *
 * @param predicateRange range used to build the query predicate on {@code columnHandle}
 * @param partitionStatistics statistics attached to the single test partition
 * @param expectedRedundantColumnDomains expected redundant column domains, one set per split
 * @param columnHandle column the predicate is applied to
 */
private void assertRedundantColumnDomains(Range predicateRange, PartitionStatistics partitionStatistics, List<Set<ColumnHandle>> expectedRedundantColumnDomains, HiveColumnHandle columnHandle) throws Exception {
    // Query predicate: a tuple domain with a single range on the given column
    Domain predicateDomain = Domain.create(SortedRangeSet.copyOf(predicateRange.getType(), ImmutableList.of(predicateRange)), false);
    TupleDomain<ColumnHandle> queryTupleDomain = TupleDomain.fromColumnDomains(
            Optional.of(ImmutableList.of(new ColumnDomain<>(columnHandle, predicateDomain))));

    // The partition under test, paired with the supplied statistics
    Storage partitionStorage = new Storage(fromHiveStorageFormat(ORC), "location", Optional.empty(), true, ImmutableMap.of(), ImmutableMap.of());
    Partition partition = new Partition(
            "test_db",
            "test_table",
            ImmutableList.of(PARTITION_VALUE),
            partitionStorage,
            COLUMNS,
            ImmutableMap.of(),
            Optional.empty(),
            false,
            true,
            0);
    PartitionWithStatistics partitionWithStatistics = new PartitionWithStatistics(partition, PARTITION_NAME, partitionStatistics);

    HiveClientConfig hiveClientConfig = new HiveClientConfig().setPartitionStatisticsBasedOptimizationEnabled(true);
    HdfsEnvironment hdfsEnvironment = new HdfsEnvironment(
            new HiveHdfsConfiguration(new HdfsConfigurationInitializer(hiveClientConfig, new MetastoreClientConfig()), ImmutableSet.of()),
            new MetastoreClientConfig(),
            new NoHdfsAuthentication());

    HiveMetadataFactory metadataFactory = new HiveMetadataFactory(
            new TestingExtendedHiveMetastore(TEST_TABLE, partitionWithStatistics),
            hdfsEnvironment,
            new HivePartitionManager(FUNCTION_AND_TYPE_MANAGER, hiveClientConfig),
            DateTimeZone.forOffsetHours(1),
            true,
            false,
            false,
            false,
            true,
            true,
            hiveClientConfig.getMaxPartitionBatchSize(),
            hiveClientConfig.getMaxPartitionsPerScan(),
            false,
            FUNCTION_AND_TYPE_MANAGER,
            new HiveLocationService(hdfsEnvironment),
            FUNCTION_RESOLUTION,
            ROW_EXPRESSION_SERVICE,
            FILTER_STATS_CALCULATOR_SERVICE,
            new TableParameterCodec(),
            HiveTestUtils.PARTITION_UPDATE_CODEC,
            HiveTestUtils.PARTITION_UPDATE_SMILE_CODEC,
            executor,
            new HiveTypeTranslator(),
            new HiveStagingFileCommitter(hdfsEnvironment, executor),
            new HiveZeroRowFileCreator(hdfsEnvironment, new OutputStreamDataSinkFactory(), executor),
            TEST_SERVER_VERSION,
            new HivePartitionObjectBuilder(),
            new HiveEncryptionInformationProvider(ImmutableList.of()),
            new HivePartitionStats(),
            new HiveFileRenamer(),
            HiveColumnConverterProvider.DEFAULT_COLUMN_CONVERTER_PROVIDER);

    HiveSplitManager splitManager = new HiveSplitManager(
            new TestingHiveTransactionManager(metadataFactory),
            new NamenodeStats(),
            hdfsEnvironment,
            new TestingDirectoryLister(),
            directExecutor(),
            new HiveCoercionPolicy(FUNCTION_AND_TYPE_MANAGER),
            new CounterStat(),
            100,
            hiveClientConfig.getMaxOutstandingSplitsSize(),
            hiveClientConfig.getMinPartitionBatchSize(),
            hiveClientConfig.getMaxPartitionBatchSize(),
            hiveClientConfig.getSplitLoaderConcurrency(),
            false,
            new ConfigBasedCacheQuotaRequirementProvider(new CacheConfig()),
            new HiveEncryptionInformationProvider(ImmutableList.of()));

    // Table layout: one string partition key column "ds" holding PARTITION_VALUE
    HiveColumnHandle partitionColumn = new HiveColumnHandle("ds", HIVE_STRING, parseTypeSignature(VARCHAR), MAX_PARTITION_KEY_COLUMN_INDEX, PARTITION_KEY, Optional.empty(), Optional.empty());
    List<HivePartition> partitions = ImmutableList.of(new HivePartition(
            new SchemaTableName("test_schema", "test_table"),
            PARTITION_NAME,
            ImmutableMap.of(partitionColumn, NullableValue.of(createUnboundedVarcharType(), utf8Slice(PARTITION_VALUE)))));
    TupleDomain<Subfield> domainPredicate = queryTupleDomain
            .transform(HiveColumnHandle.class::cast)
            .transform(hiveColumn -> new Subfield(hiveColumn.getName(), ImmutableList.of()));

    TestingConnectorSession session = new TestingConnectorSession(
            new HiveSessionProperties(hiveClientConfig, new OrcFileWriterConfig(), new ParquetFileWriterConfig(), new CacheConfig()).getSessionProperties());
    HiveTableLayoutHandle layoutHandle = new HiveTableLayoutHandle(
            new SchemaTableName("test_schema", "test_table"),
            "test_path",
            ImmutableList.of(partitionColumn),
            COLUMNS,
            ImmutableMap.of(),
            partitions,
            domainPredicate,
            TRUE_CONSTANT,
            ImmutableMap.of(partitionColumn.getName(), partitionColumn, columnHandle.getName(), columnHandle),
            queryTupleDomain,
            Optional.empty(),
            Optional.empty(),
            false,
            "layout",
            Optional.empty(),
            false);

    ConnectorSplitSource splitSource = splitManager.getSplits(new HiveTransactionHandle(), session, layoutHandle, SPLIT_SCHEDULING_CONTEXT);

    // Pull one batch of splits and collect what each split reports as redundant
    List<Set<ColumnHandle>> actualRedundantColumnDomains = splitSource.getNextBatch(NOT_PARTITIONED, 100)
            .get()
            .getSplits()
            .stream()
            .map(split -> ((HiveSplit) split).getRedundantColumnDomains())
            .collect(toImmutableList());
    assertEquals(actualRedundantColumnDomains, expectedRedundantColumnDomains);
}
Use of com.facebook.presto.hive.metastore.Storage in the presto project by prestodb.
Class TestHiveSplit, method testJsonRoundTrip.
/**
 * Serializes a fully populated {@link HiveSplit} to JSON with the split codec, reads it back,
 * and compares the decoded split with the original getter by getter.
 */
@Test
public void testJsonRoundTrip() throws Exception {
    ImmutableList<HivePartitionKey> partitionKeys = ImmutableList.of(
            new HivePartitionKey("a", Optional.of("apple")),
            new HivePartitionKey("b", Optional.of("42")));
    ImmutableList<HostAddress> addresses = ImmutableList.of(
            HostAddress.fromParts("127.0.0.1", 44),
            HostAddress.fromParts("127.0.0.1", 45));
    Map<String, String> customSplitInfo = ImmutableMap.of("key", "value");
    Set<ColumnHandle> redundantColumnDomains = ImmutableSet.of(
            new HiveColumnHandle("test_column", HIVE_LONG, HIVE_LONG.getTypeSignature(), 5, REGULAR, Optional.empty(), ImmutableList.of(), Optional.empty()));

    long fileModifiedTime = Instant.now().toEpochMilli();
    Storage storage = new Storage(
            StorageFormat.create("serde", "input", "output"),
            "location",
            Optional.empty(),
            false,
            ImmutableMap.of(),
            ImmutableMap.of());
    TableToPartitionMapping tableToPartitionMapping = TableToPartitionMapping.mapColumnsByIndex(
            ImmutableMap.of(1, new Column("name", HIVE_STRING, Optional.empty(), Optional.empty())));
    HiveSplit.BucketConversion bucketConversion = new HiveSplit.BucketConversion(
            32,
            16,
            ImmutableList.of(new HiveColumnHandle("col", HIVE_LONG, BIGINT.getTypeSignature(), 5, REGULAR, Optional.of("comment"), Optional.empty())));
    EncryptionInformation encryptionInformation = EncryptionInformation.fromEncryptionMetadata(
            DwrfEncryptionMetadata.forPerField(ImmutableMap.of("field1", "test1".getBytes()), ImmutableMap.of(), "test_algo", "test_provider"));
    // some non-standard value
    SplitWeight splitWeight = SplitWeight.fromProportion(2.0);

    HiveSplit expected = new HiveSplit(
            "db",
            "table",
            "partitionId",
            "path",
            42,
            87,
            88,
            fileModifiedTime,
            storage,
            partitionKeys,
            addresses,
            OptionalInt.empty(),
            OptionalInt.empty(),
            NO_PREFERENCE,
            10,
            tableToPartitionMapping,
            Optional.of(bucketConversion),
            false,
            Optional.empty(),
            NO_CACHE_REQUIREMENT,
            Optional.of(encryptionInformation),
            customSplitInfo,
            redundantColumnDomains,
            splitWeight);

    // Round trip through the JSON codec
    JsonCodec<HiveSplit> splitCodec = getJsonCodec();
    String serialized = splitCodec.toJson(expected);
    HiveSplit actual = splitCodec.fromJson(serialized);

    assertEquals(actual.getDatabase(), expected.getDatabase());
    assertEquals(actual.getTable(), expected.getTable());
    assertEquals(actual.getPartitionName(), expected.getPartitionName());
    assertEquals(actual.getPath(), expected.getPath());
    assertEquals(actual.getStart(), expected.getStart());
    assertEquals(actual.getLength(), expected.getLength());
    assertEquals(actual.getFileSize(), expected.getFileSize());
    assertEquals(actual.getStorage(), expected.getStorage());
    assertEquals(actual.getPartitionKeys(), expected.getPartitionKeys());
    assertEquals(actual.getAddresses(), expected.getAddresses());
    assertEquals(actual.getPartitionDataColumnCount(), expected.getPartitionDataColumnCount());
    assertEquals(
            actual.getTableToPartitionMapping().getPartitionSchemaDifference(),
            expected.getTableToPartitionMapping().getPartitionSchemaDifference());
    assertEquals(
            actual.getTableToPartitionMapping().getTableToPartitionColumns(),
            expected.getTableToPartitionMapping().getTableToPartitionColumns());
    assertEquals(actual.getBucketConversion(), expected.getBucketConversion());
    assertEquals(actual.getNodeSelectionStrategy(), expected.getNodeSelectionStrategy());
    assertEquals(actual.isS3SelectPushdownEnabled(), expected.isS3SelectPushdownEnabled());
    assertEquals(actual.getCacheQuotaRequirement(), expected.getCacheQuotaRequirement());
    assertEquals(actual.getEncryptionInformation(), expected.getEncryptionInformation());
    assertEquals(actual.getCustomSplitInfo(), expected.getCustomSplitInfo());
    assertEquals(actual.getSplitWeight(), expected.getSplitWeight());
}
Use of com.facebook.presto.hive.metastore.Storage in the presto project by prestodb.
Class HiveMetadata, method beginCreateTable.
/**
 * Prepares a CREATE TABLE AS write: validates the requested table properties, resolves storage
 * formats and the write location, builds the output table handle, and registers the intended
 * write with the metastore.
 *
 * @param session the session issuing the statement
 * @param tableMetadata name, columns and properties of the table to create
 * @param layout requested new-table layout; not read by this method
 * @return the handle describing the table being written
 * @throws PrestoException if an external location or an Avro schema URL is set, or if the table
 *         is encrypted and the effective storage format differs from the table storage format
 */
@Override
public HiveOutputTableHandle beginCreateTable(ConnectorSession session, ConnectorTableMetadata tableMetadata, Optional<ConnectorNewTableLayout> layout) {
    verifyJvmTimeZone();

    Map<String, Object> properties = tableMetadata.getProperties();
    if (getExternalLocation(properties) != null) {
        throw new PrestoException(NOT_SUPPORTED, "External tables cannot be created using CREATE TABLE AS");
    }
    if (getAvroSchemaUrl(properties) != null) {
        throw new PrestoException(NOT_SUPPORTED, "CREATE TABLE AS not supported when Avro schema url is set");
    }

    HiveStorageFormat tableStorageFormat = getHiveStorageFormat(properties);
    List<String> partitionedBy = getPartitionedBy(properties);
    Optional<HiveBucketProperty> bucketProperty = getBucketProperty(properties);
    List<SortingColumn> preferredOrderingColumns = getPreferredOrderingColumns(properties);

    // get the root directory for the database
    SchemaTableName schemaTableName = tableMetadata.getTable();
    String schemaName = schemaTableName.getSchemaName();
    String tableName = schemaTableName.getTableName();

    Optional<TableEncryptionProperties> tableEncryptionProperties = getTableEncryptionPropertiesFromTableProperties(tableMetadata, tableStorageFormat, partitionedBy);
    List<HiveColumnHandle> columnHandles = getColumnHandles(tableMetadata, ImmutableSet.copyOf(partitionedBy), typeTranslator);

    // Partitions follow the table format only when the session asks for it
    HiveStorageFormat partitionStorageFormat;
    if (isRespectTableFormat(session)) {
        partitionStorageFormat = tableStorageFormat;
    }
    else {
        partitionStorageFormat = getHiveStorageFormat(session);
    }

    // unpartitioned tables ignore the partition storage format
    HiveStorageFormat actualStorageFormat;
    if (partitionedBy.isEmpty()) {
        actualStorageFormat = tableStorageFormat;
    }
    else {
        actualStorageFormat = partitionStorageFormat;
    }
    validateColumns(actualStorageFormat, columnHandles);

    boolean formatMismatch = tableStorageFormat != actualStorageFormat;
    if (tableEncryptionProperties.isPresent() && formatMismatch) {
        throw new PrestoException(
                INVALID_TABLE_PROPERTY,
                format("For encrypted tables, partition format (%s) should match table format (%s). Using the session property %s or appropriately setting %s can help with ensuring this", actualStorageFormat.name(), tableStorageFormat.name(), RESPECT_TABLE_FORMAT, HIVE_STORAGE_FORMAT));
    }

    MetastoreContext metastoreContext = getMetastoreContext(session);
    Map<String, HiveColumnHandle> columnHandlesByName = Maps.uniqueIndex(columnHandles, HiveColumnHandle::getName);
    List<Column> partitionColumns = partitionedBy.stream()
            .map(partitionColumnName -> columnHandleToColumn(metastoreContext, columnHandlesByName.get(partitionColumnName)))
            .collect(toList());
    checkPartitionTypesSupported(partitionColumns);

    LocationHandle locationHandle = locationService.forNewTable(
            metastore,
            session,
            schemaName,
            tableName,
            isTempPathRequired(session, bucketProperty, preferredOrderingColumns));
    HdfsContext hdfsContext = new HdfsContext(session, schemaName, tableName, locationHandle.getTargetPath().toString(), true);
    Map<String, String> tableProperties = getEmptyTableProperties(tableMetadata, hdfsContext, tableStorageFormat, tableEncryptionProperties);

    HiveOutputTableHandle outputHandle = new HiveOutputTableHandle(
            schemaName,
            tableName,
            columnHandles,
            metastore.generatePageSinkMetadata(metastoreContext, schemaTableName),
            locationHandle,
            tableStorageFormat,
            partitionStorageFormat,
            actualStorageFormat,
            getHiveCompressionCodec(session, false, actualStorageFormat),
            partitionedBy,
            bucketProperty,
            preferredOrderingColumns,
            session.getUser(),
            tableProperties,
            encryptionInformationProvider.getWriteEncryptionInformation(session, tableEncryptionProperties, schemaName, tableName));

    // Register the pending write with the metastore before handing the handle back
    WriteInfo writeInfo = locationService.getQueryWriteInfo(locationHandle);
    metastore.declareIntentionToWrite(
            hdfsContext,
            metastoreContext,
            writeInfo.getWriteMode(),
            writeInfo.getWritePath(),
            writeInfo.getTempPath(),
            schemaTableName,
            false);
    return outputHandle;
}
Use of com.facebook.presto.hive.metastore.Storage in the presto project by prestodb.
Class HivePageSourceProvider, method getPageSourceFromCursorProvider.
/**
 * Tries each record cursor provider in turn and wraps the first cursor produced into a page
 * source.
 *
 * <p>The cursor is wrapped, in order, with a bucket adapter (when {@code bucketAdaptation} is
 * present), a coercion cursor (for every provider except {@link GenericHiveRecordCursorProvider}),
 * and a {@link HiveRecordCursor}. The resulting {@link RecordPageSource} is additionally wrapped
 * in a {@link FilteringPageSource} when {@code isPushdownFilterEnabled} is set.
 *
 * @return the page source built from the first provider that yields a cursor, or
 *         {@link Optional#empty()} if no provider yields one
 * @throws UnsupportedOperationException if {@code hiveColumns} is non-empty and every column is
 *         of type {@code AGGREGATED}
 */
private static Optional<ConnectorPageSource> getPageSourceFromCursorProvider(Set<HiveRecordCursorProvider> cursorProviders, Configuration configuration, ConnectorSession session, Path path, long start, long length, long fileSize, Storage storage, TupleDomain<HiveColumnHandle> effectivePredicate, List<HiveColumnHandle> hiveColumns, DateTimeZone hiveStorageTimeZone, TypeManager typeManager, SchemaTableName tableName, List<HiveColumnHandle> partitionKeyColumnHandles, List<Column> tableDataColumns, Map<String, String> tableParameters, int partitionDataColumnCount, TableToPartitionMapping tableToPartitionMapping, boolean s3SelectPushdownEnabled, RowExpression remainingPredicate, boolean isPushdownFilterEnabled, RowExpressionService rowExpressionService, Map<String, String> customSplitInfo, List<HiveColumnHandle> allColumns, List<ColumnMapping> columnMappings, Set<Integer> outputIndices, List<ColumnMapping> regularAndInterimColumnMappings, Optional<BucketAdaptation> bucketAdaptation) {
    boolean onlyAggregatedColumns = !hiveColumns.isEmpty()
            && hiveColumns.stream().allMatch(handle -> handle.getColumnType() == AGGREGATED);
    if (onlyAggregatedColumns) {
        throw new UnsupportedOperationException("Partial aggregation pushdown only supported for ORC/Parquet files. " + "Table " + tableName.toString() + " has file (" + path.toString() + ") of format " + storage.getStorageFormat().getOutputFormat() + ". Set session property hive.pushdown_partial_aggregations_into_scan=false and execute query again");
    }

    for (HiveRecordCursorProvider provider : cursorProviders) {
        // GenericHiveRecordCursor will automatically do the coercion without HiveCoercionRecordCursor
        boolean doCoercion = !(provider instanceof GenericHiveRecordCursorProvider);

        List<Column> partitionDataColumns = reconstructPartitionSchema(
                tableDataColumns,
                partitionDataColumnCount,
                tableToPartitionMapping.getPartitionSchemaDifference(),
                tableToPartitionMapping.getTableToPartitionColumns());
        Properties schema = getHiveSchema(
                storage,
                partitionDataColumns,
                tableDataColumns,
                tableParameters,
                tableName.getSchemaName(),
                tableName.getTableName(),
                partitionKeyColumnHandles.stream().map(HiveColumnHandle::getName).collect(toImmutableList()),
                partitionKeyColumnHandles.stream().map(HiveColumnHandle::getHiveType).collect(toImmutableList()));

        Optional<RecordCursor> cursor = provider.createRecordCursor(
                configuration,
                session,
                path,
                start,
                length,
                fileSize,
                schema,
                toColumnHandles(regularAndInterimColumnMappings, doCoercion),
                effectivePredicate,
                hiveStorageTimeZone,
                typeManager,
                s3SelectPushdownEnabled,
                customSplitInfo);
        if (!cursor.isPresent()) {
            // This provider produced no cursor; try the next one
            continue;
        }

        RecordCursor delegate = cursor.get();
        if (bucketAdaptation.isPresent()) {
            BucketAdaptation adaptation = bucketAdaptation.get();
            delegate = new HiveBucketAdapterRecordCursor(
                    adaptation.getBucketColumnIndices(),
                    adaptation.getBucketColumnHiveTypes(),
                    adaptation.getTableBucketCount(),
                    adaptation.getPartitionBucketCount(),
                    adaptation.getBucketToKeep(),
                    typeManager,
                    delegate);
        }

        // Need to wrap RcText and RcBinary into a wrapper, which will do the coercion for mismatch columns
        if (doCoercion) {
            delegate = new HiveCoercionRecordCursor(regularAndInterimColumnMappings, typeManager, delegate);
        }

        HiveRecordCursor hiveRecordCursor = new HiveRecordCursor(columnMappings, hiveStorageTimeZone, typeManager, delegate);
        List<Type> columnTypes = allColumns.stream()
                .map(column -> typeManager.getType(column.getTypeSignature()))
                .collect(toList());
        RecordPageSource recordPageSource = new RecordPageSource(columnTypes, hiveRecordCursor);

        if (!isPushdownFilterEnabled) {
            return Optional.of(recordPageSource);
        }
        return Optional.of(new FilteringPageSource(columnMappings, effectivePredicate, remainingPredicate, typeManager, rowExpressionService, session, outputIndices, recordPageSource));
    }
    return Optional.empty();
}
Aggregations