use of io.trino.plugin.hive.HiveColumnProjectionInfo in project trino by trinodb.
the class OrcPageSourceFactory method getNestedColumn.
private static OrcColumn getNestedColumn(OrcColumn baseColumn, Optional<HiveColumnProjectionInfo> projectionInfo) {
if (projectionInfo.isEmpty()) {
return baseColumn;
}
OrcColumn current = baseColumn;
for (String field : projectionInfo.get().getDereferenceNames()) {
Optional<OrcColumn> orcColumn = current.getNestedColumns().stream().filter(column -> column.getColumnName().toLowerCase(ENGLISH).equals(field)).findFirst();
if (orcColumn.isEmpty()) {
return null;
}
current = orcColumn.get();
}
return current;
}
use of io.trino.plugin.hive.HiveColumnProjectionInfo in project trino by trinodb.
the class OrcPageSourceFactory method createOrcPageSource.
private ConnectorPageSource createOrcPageSource(HdfsEnvironment hdfsEnvironment, ConnectorIdentity identity, Configuration configuration, Path path, long start, long length, long estimatedFileSize, List<HiveColumnHandle> columns, List<HiveColumnHandle> projections, boolean useOrcColumnNames, boolean isFullAcid, TupleDomain<HiveColumnHandle> effectivePredicate, DateTimeZone legacyFileTimeZone, OrcReaderOptions options, Optional<AcidInfo> acidInfo, OptionalInt bucketNumber, boolean originalFile, AcidTransaction transaction, FileFormatDataSourceStats stats) {
for (HiveColumnHandle column : columns) {
checkArgument(column.getColumnType() == REGULAR, "column type must be regular: %s", column);
}
checkArgument(!effectivePredicate.isNone());
OrcDataSource orcDataSource;
boolean originalFilesPresent = acidInfo.isPresent() && !acidInfo.get().getOriginalFiles().isEmpty();
try {
FileSystem fileSystem = hdfsEnvironment.getFileSystem(identity, path, configuration);
FSDataInputStream inputStream = hdfsEnvironment.doAs(identity, () -> fileSystem.open(path));
orcDataSource = new HdfsOrcDataSource(new OrcDataSourceId(path.toString()), estimatedFileSize, options, inputStream, stats);
} catch (Exception e) {
if (nullToEmpty(e.getMessage()).trim().equals("Filesystem closed") || e instanceof FileNotFoundException) {
throw new TrinoException(HIVE_CANNOT_OPEN_SPLIT, e);
}
throw new TrinoException(HIVE_CANNOT_OPEN_SPLIT, splitError(e, path, start, length), e);
}
AggregatedMemoryContext memoryUsage = newSimpleAggregatedMemoryContext();
try {
Optional<OrcReader> optionalOrcReader = OrcReader.createOrcReader(orcDataSource, options);
if (optionalOrcReader.isEmpty()) {
return new EmptyPageSource();
}
OrcReader reader = optionalOrcReader.get();
if (!originalFile && acidInfo.isPresent() && !acidInfo.get().isOrcAcidVersionValidated()) {
validateOrcAcidVersion(path, reader);
}
List<OrcColumn> fileColumns = reader.getRootColumn().getNestedColumns();
int actualColumnCount = columns.size() + (isFullAcid ? 3 : 0);
List<OrcColumn> fileReadColumns = new ArrayList<>(actualColumnCount);
List<Type> fileReadTypes = new ArrayList<>(actualColumnCount);
List<OrcReader.ProjectedLayout> fileReadLayouts = new ArrayList<>(actualColumnCount);
if (isFullAcid && !originalFilesPresent) {
verifyAcidSchema(reader, path);
Map<String, OrcColumn> acidColumnsByName = uniqueIndex(fileColumns, orcColumn -> orcColumn.getColumnName().toLowerCase(ENGLISH));
fileColumns = ensureColumnNameConsistency(acidColumnsByName.get(AcidSchema.ACID_COLUMN_ROW_STRUCT.toLowerCase(ENGLISH)).getNestedColumns(), columns);
fileReadColumns.add(acidColumnsByName.get(AcidSchema.ACID_COLUMN_ORIGINAL_TRANSACTION.toLowerCase(ENGLISH)));
fileReadTypes.add(BIGINT);
fileReadLayouts.add(fullyProjectedLayout());
fileReadColumns.add(acidColumnsByName.get(AcidSchema.ACID_COLUMN_BUCKET.toLowerCase(ENGLISH)));
fileReadTypes.add(INTEGER);
fileReadLayouts.add(fullyProjectedLayout());
fileReadColumns.add(acidColumnsByName.get(AcidSchema.ACID_COLUMN_ROW_ID.toLowerCase(ENGLISH)));
fileReadTypes.add(BIGINT);
fileReadLayouts.add(fullyProjectedLayout());
}
Map<String, OrcColumn> fileColumnsByName = ImmutableMap.of();
if (useOrcColumnNames || isFullAcid) {
verifyFileHasColumnNames(fileColumns, path);
// Convert column names read from ORC files to lower case to be consistent with those stored in Hive Metastore
fileColumnsByName = uniqueIndex(fileColumns, orcColumn -> orcColumn.getColumnName().toLowerCase(ENGLISH));
}
Map<String, List<List<String>>> projectionsByColumnName = ImmutableMap.of();
Map<Integer, List<List<String>>> projectionsByColumnIndex = ImmutableMap.of();
if (useOrcColumnNames || isFullAcid) {
projectionsByColumnName = projections.stream().collect(Collectors.groupingBy(HiveColumnHandle::getBaseColumnName, mapping(OrcPageSourceFactory::getDereferencesAsList, toList())));
} else {
projectionsByColumnIndex = projections.stream().collect(Collectors.groupingBy(HiveColumnHandle::getBaseHiveColumnIndex, mapping(OrcPageSourceFactory::getDereferencesAsList, toList())));
}
TupleDomainOrcPredicateBuilder predicateBuilder = TupleDomainOrcPredicate.builder().setBloomFiltersEnabled(options.isBloomFiltersEnabled()).setDomainCompactionThreshold(domainCompactionThreshold);
Map<HiveColumnHandle, Domain> effectivePredicateDomains = effectivePredicate.getDomains().orElseThrow(() -> new IllegalArgumentException("Effective predicate is none"));
List<ColumnAdaptation> columnAdaptations = new ArrayList<>(columns.size());
for (HiveColumnHandle column : columns) {
OrcColumn orcColumn = null;
OrcReader.ProjectedLayout projectedLayout = null;
Map<Optional<HiveColumnProjectionInfo>, Domain> columnDomains = null;
if (useOrcColumnNames || isFullAcid) {
String columnName = column.getName().toLowerCase(ENGLISH);
orcColumn = fileColumnsByName.get(columnName);
if (orcColumn != null) {
projectedLayout = createProjectedLayout(orcColumn, projectionsByColumnName.get(columnName));
columnDomains = effectivePredicateDomains.entrySet().stream().filter(columnDomain -> columnDomain.getKey().getBaseColumnName().toLowerCase(ENGLISH).equals(columnName)).collect(toImmutableMap(columnDomain -> columnDomain.getKey().getHiveColumnProjectionInfo(), Map.Entry::getValue));
}
} else if (column.getBaseHiveColumnIndex() < fileColumns.size()) {
orcColumn = fileColumns.get(column.getBaseHiveColumnIndex());
if (orcColumn != null) {
projectedLayout = createProjectedLayout(orcColumn, projectionsByColumnIndex.get(column.getBaseHiveColumnIndex()));
columnDomains = effectivePredicateDomains.entrySet().stream().filter(columnDomain -> columnDomain.getKey().getBaseHiveColumnIndex() == column.getBaseHiveColumnIndex()).collect(toImmutableMap(columnDomain -> columnDomain.getKey().getHiveColumnProjectionInfo(), Map.Entry::getValue));
}
}
Type readType = column.getType();
if (orcColumn != null) {
int sourceIndex = fileReadColumns.size();
columnAdaptations.add(ColumnAdaptation.sourceColumn(sourceIndex));
fileReadColumns.add(orcColumn);
fileReadTypes.add(readType);
fileReadLayouts.add(projectedLayout);
// Add predicates on top-level and nested columns
for (Map.Entry<Optional<HiveColumnProjectionInfo>, Domain> columnDomain : columnDomains.entrySet()) {
OrcColumn nestedColumn = getNestedColumn(orcColumn, columnDomain.getKey());
if (nestedColumn != null) {
predicateBuilder.addColumn(nestedColumn.getColumnId(), columnDomain.getValue());
}
}
} else {
columnAdaptations.add(ColumnAdaptation.nullColumn(readType));
}
}
OrcRecordReader recordReader = reader.createRecordReader(fileReadColumns, fileReadTypes, fileReadLayouts, predicateBuilder.build(), start, length, legacyFileTimeZone, memoryUsage, INITIAL_BATCH_SIZE, exception -> handleException(orcDataSource.getId(), exception), NameBasedFieldMapper::create);
Optional<OrcDeletedRows> deletedRows = acidInfo.map(info -> new OrcDeletedRows(path.getName(), new OrcDeleteDeltaPageSourceFactory(options, identity, configuration, hdfsEnvironment, stats), identity, configuration, hdfsEnvironment, info, bucketNumber, memoryUsage));
Optional<Long> originalFileRowId = acidInfo.filter(OrcPageSourceFactory::hasOriginalFiles).map(info -> OriginalFilesUtils.getPrecedingRowCount(acidInfo.get().getOriginalFiles(), path, hdfsEnvironment, identity, options, configuration, stats));
if (transaction.isDelete()) {
if (originalFile) {
int bucket = bucketNumber.orElse(0);
long startingRowId = originalFileRowId.orElse(0L);
columnAdaptations.add(ColumnAdaptation.originalFileRowIdColumn(startingRowId, bucket));
} else {
columnAdaptations.add(ColumnAdaptation.rowIdColumn());
}
} else if (transaction.isUpdate()) {
HiveUpdateProcessor updateProcessor = transaction.getUpdateProcessor().orElseThrow(() -> new IllegalArgumentException("updateProcessor not present"));
List<HiveColumnHandle> dependencyColumns = projections.stream().filter(HiveColumnHandle::isBaseColumn).collect(toImmutableList());
if (originalFile) {
int bucket = bucketNumber.orElse(0);
long startingRowId = originalFileRowId.orElse(0L);
columnAdaptations.add(updatedRowColumnsWithOriginalFiles(startingRowId, bucket, updateProcessor, dependencyColumns));
} else {
columnAdaptations.add(updatedRowColumns(updateProcessor, dependencyColumns));
}
}
return new OrcPageSource(recordReader, columnAdaptations, orcDataSource, deletedRows, originalFileRowId, memoryUsage, stats);
} catch (Exception e) {
try {
orcDataSource.close();
} catch (IOException ignored) {
}
if (e instanceof TrinoException) {
throw (TrinoException) e;
}
String message = splitError(e, path, start, length);
if (e instanceof BlockMissingException) {
throw new TrinoException(HIVE_MISSING_DATA, message, e);
}
throw new TrinoException(HIVE_CANNOT_OPEN_SPLIT, message, e);
}
}
use of io.trino.plugin.hive.HiveColumnProjectionInfo in project trino by trinodb.
the class TestParquetPageSourceFactory method testGetNestedMixedRepetitionColumnType.
@Test
public void testGetNestedMixedRepetitionColumnType() {
RowType rowType = rowType(RowType.field("optional_level2", rowType(RowType.field("required_level3", IntegerType.INTEGER))));
HiveColumnHandle columnHandle = new HiveColumnHandle("optional_level1", 0, HiveType.valueOf("struct<optional_level2:struct<required_level3:int>>"), rowType, Optional.of(new HiveColumnProjectionInfo(ImmutableList.of(1, 1), ImmutableList.of("optional_level2", "required_level3"), toHiveType(IntegerType.INTEGER), IntegerType.INTEGER)), REGULAR, Optional.empty());
MessageType fileSchema = new MessageType("hive_schema", new GroupType(OPTIONAL, "optional_level1", new GroupType(OPTIONAL, "optional_level2", new PrimitiveType(REQUIRED, INT32, "required_level3"))));
assertEquals(ParquetPageSourceFactory.getColumnType(columnHandle, fileSchema, true).get(), fileSchema.getType("optional_level1"));
}
use of io.trino.plugin.hive.HiveColumnProjectionInfo in project trino by trinodb.
the class TestConnectorPushdownRulesWithHive method testPushdownWithDuplicateExpressions.
@Test
public void testPushdownWithDuplicateExpressions() {
String tableName = "duplicate_expressions";
tester().getQueryRunner().execute(format("CREATE TABLE %s (struct_of_bigint, just_bigint) AS SELECT cast(row(5, 6) AS row(a bigint, b bigint)) AS struct_of_int, 5 AS just_bigint WHERE false", tableName));
PushProjectionIntoTableScan pushProjectionIntoTableScan = new PushProjectionIntoTableScan(tester().getPlannerContext(), tester().getTypeAnalyzer(), new ScalarStatsCalculator(tester().getPlannerContext(), tester().getTypeAnalyzer()));
HiveTableHandle hiveTable = new HiveTableHandle(SCHEMA_NAME, tableName, ImmutableMap.of(), ImmutableList.of(), ImmutableList.of(), Optional.empty());
TableHandle table = new TableHandle(new CatalogName(HIVE_CATALOG_NAME), hiveTable, new HiveTransactionHandle(false));
HiveColumnHandle bigintColumn = createBaseColumn("just_bigint", 1, toHiveType(BIGINT), BIGINT, REGULAR, Optional.empty());
HiveColumnHandle partialColumn = new HiveColumnHandle("struct_of_bigint", 0, toHiveType(ROW_TYPE), ROW_TYPE, Optional.of(new HiveColumnProjectionInfo(ImmutableList.of(0), ImmutableList.of("a"), toHiveType(BIGINT), BIGINT)), REGULAR, Optional.empty());
// Test projection pushdown with duplicate column references
tester().assertThat(pushProjectionIntoTableScan).on(p -> {
SymbolReference column = p.symbol("just_bigint", BIGINT).toSymbolReference();
Expression negation = new ArithmeticUnaryExpression(MINUS, column);
return p.project(Assignments.of(// The column reference is part of both the assignments
p.symbol("column_ref", BIGINT), column, p.symbol("negated_column_ref", BIGINT), negation), p.tableScan(table, ImmutableList.of(p.symbol("just_bigint", BIGINT)), ImmutableMap.of(p.symbol("just_bigint", BIGINT), bigintColumn)));
}).matches(project(ImmutableMap.of("column_ref", expression("just_bigint_0"), "negated_column_ref", expression("- just_bigint_0")), tableScan(hiveTable.withProjectedColumns(ImmutableSet.of(bigintColumn))::equals, TupleDomain.all(), ImmutableMap.of("just_bigint_0", bigintColumn::equals))));
// Test Dereference pushdown
tester().assertThat(pushProjectionIntoTableScan).on(p -> {
SubscriptExpression subscript = new SubscriptExpression(p.symbol("struct_of_bigint", ROW_TYPE).toSymbolReference(), new LongLiteral("1"));
Expression sum = new ArithmeticBinaryExpression(ADD, subscript, new LongLiteral("2"));
return p.project(Assignments.of(// The subscript expression instance is part of both the assignments
p.symbol("expr_deref", BIGINT), subscript, p.symbol("expr_deref_2", BIGINT), sum), p.tableScan(table, ImmutableList.of(p.symbol("struct_of_bigint", ROW_TYPE)), ImmutableMap.of(p.symbol("struct_of_bigint", ROW_TYPE), partialColumn.getBaseColumn())));
}).matches(project(ImmutableMap.of("expr_deref", expression(new SymbolReference("struct_of_bigint#a")), "expr_deref_2", expression(new ArithmeticBinaryExpression(ADD, new SymbolReference("struct_of_bigint#a"), new LongLiteral("2")))), tableScan(hiveTable.withProjectedColumns(ImmutableSet.of(partialColumn))::equals, TupleDomain.all(), ImmutableMap.of("struct_of_bigint#a", partialColumn::equals))));
metastore.dropTable(SCHEMA_NAME, tableName, true);
}
use of io.trino.plugin.hive.HiveColumnProjectionInfo in project trino by trinodb.
the class TestConnectorPushdownRulesWithHive method testProjectionPushdown.
@Test
public void testProjectionPushdown() {
String tableName = "projection_test";
PushProjectionIntoTableScan pushProjectionIntoTableScan = new PushProjectionIntoTableScan(tester().getPlannerContext(), tester().getTypeAnalyzer(), new ScalarStatsCalculator(tester().getPlannerContext(), tester().getTypeAnalyzer()));
tester().getQueryRunner().execute(format("CREATE TABLE %s (struct_of_int) AS " + "SELECT cast(row(5, 6) as row(a bigint, b bigint)) as struct_of_int where false", tableName));
Type baseType = ROW_TYPE;
HiveColumnHandle partialColumn = new HiveColumnHandle("struct_of_int", 0, toHiveType(baseType), baseType, Optional.of(new HiveColumnProjectionInfo(ImmutableList.of(0), ImmutableList.of("a"), toHiveType(BIGINT), BIGINT)), REGULAR, Optional.empty());
HiveTableHandle hiveTable = new HiveTableHandle(SCHEMA_NAME, tableName, ImmutableMap.of(), ImmutableList.of(), ImmutableList.of(), Optional.empty());
TableHandle table = new TableHandle(new CatalogName(HIVE_CATALOG_NAME), hiveTable, new HiveTransactionHandle(false));
HiveColumnHandle fullColumn = partialColumn.getBaseColumn();
// Test projected columns pushdown to HiveTableHandle in case of full column references
tester().assertThat(pushProjectionIntoTableScan).on(p -> p.project(Assignments.of(p.symbol("struct_of_int", baseType), p.symbol("struct_of_int", baseType).toSymbolReference()), p.tableScan(table, ImmutableList.of(p.symbol("struct_of_int", baseType)), ImmutableMap.of(p.symbol("struct_of_int", baseType), fullColumn)))).matches(project(ImmutableMap.of("expr", expression("col")), tableScan(hiveTable.withProjectedColumns(ImmutableSet.of(fullColumn))::equals, TupleDomain.all(), ImmutableMap.of("col", fullColumn::equals))));
// Rule should return Optional.empty after projected ColumnHandles have been added to HiveTableHandle
tester().assertThat(pushProjectionIntoTableScan).on(p -> p.project(Assignments.of(p.symbol("struct_of_int", baseType), p.symbol("struct_of_int", baseType).toSymbolReference()), p.tableScan(new TableHandle(new CatalogName(HIVE_CATALOG_NAME), hiveTable.withProjectedColumns(ImmutableSet.of(fullColumn)), new HiveTransactionHandle(false)), ImmutableList.of(p.symbol("struct_of_int", baseType)), ImmutableMap.of(p.symbol("struct_of_int", baseType), fullColumn)))).doesNotFire();
// Test Dereference pushdown
tester().assertThat(pushProjectionIntoTableScan).on(p -> p.project(Assignments.of(p.symbol("expr_deref", BIGINT), new SubscriptExpression(p.symbol("struct_of_int", baseType).toSymbolReference(), new LongLiteral("1"))), p.tableScan(table, ImmutableList.of(p.symbol("struct_of_int", baseType)), ImmutableMap.of(p.symbol("struct_of_int", baseType), fullColumn)))).matches(project(ImmutableMap.of("expr_deref", expression(new SymbolReference("struct_of_int#a"))), tableScan(hiveTable.withProjectedColumns(ImmutableSet.of(partialColumn))::equals, TupleDomain.all(), ImmutableMap.of("struct_of_int#a", partialColumn::equals))));
metastore.dropTable(SCHEMA_NAME, tableName, true);
}
Aggregations