Use of io.trino.orc.OrcColumn in project trino by trinodb.
The class OrcPageSourceFactory, method ensureColumnNameConsistency.
/**
 * Recreates the list of fileColumns, renaming any whose names differ from the
 * corresponding elements of the desiredColumns list. NOTE: this renaming is
 * only applied to top-level columns, not nested columns.
 *
 * @param fileColumns all OrcColumns nested in the root column of the table
 * @param desiredColumns HiveColumnHandles for the metastore's table columns
 * @return the fileColumns list, with any OrcColumn corresponding to a
 *         desiredColumn renamed when the names differ
 */
private static List<OrcColumn> ensureColumnNameConsistency(List<OrcColumn> fileColumns, List<HiveColumnHandle> desiredColumns) {
    int columnCount = fileColumns.size();
    ImmutableList.Builder<OrcColumn> builder = ImmutableList.builderWithExpectedSize(columnCount);

    Map<Integer, HiveColumnHandle> desiredColumnsByNumber = desiredColumns.stream()
            .collect(toImmutableMap(HiveColumnHandle::getBaseHiveColumnIndex, identity()));

    for (int index = 0; index < columnCount; index++) {
        OrcColumn column = fileColumns.get(index);
        HiveColumnHandle handle = desiredColumnsByNumber.get(index);
        if (handle != null && !column.getColumnName().equals(handle.getName())) {
            // Rebuild the column with the name from the metastore handle, keeping everything else
            column = new OrcColumn(
                    column.getPath(),
                    column.getColumnId(),
                    handle.getName(),
                    column.getColumnType(),
                    column.getOrcDataSourceId(),
                    column.getNestedColumns(),
                    column.getAttributes());
        }
        builder.add(column);
    }
    return builder.build();
}
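The rename above swaps in the metastore's column name while preserving every other property of the OrcColumn. A minimal sketch of that effect, with hypothetical values (the renameExample method and all literals are invented for illustration; the constructor arguments mirror the snippet):

import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import io.trino.orc.OrcColumn;
import io.trino.orc.OrcDataSourceId;
import io.trino.orc.metadata.OrcColumnId;
import io.trino.orc.metadata.OrcType.OrcTypeKind;

static OrcColumn renameExample() {
    // Hypothetical file column written as "c0" that the metastore knows as "user_id"
    OrcColumn fileColumn = new OrcColumn(
            "c0",                                    // path
            new OrcColumnId(1),                      // column id within the ORC file
            "c0",                                    // name as recorded in the file
            OrcTypeKind.LONG,                        // physical ORC type
            new OrcDataSourceId("/tmp/example.orc"), // hypothetical data source
            ImmutableList.of(),                      // no nested columns
            ImmutableMap.of());                      // no attributes

    // ensureColumnNameConsistency would rebuild it as the same column named "user_id"
    return new OrcColumn(
            fileColumn.getPath(),
            fileColumn.getColumnId(),
            "user_id",
            fileColumn.getColumnType(),
            fileColumn.getOrcDataSourceId(),
            fileColumn.getNestedColumns(),
            fileColumn.getAttributes());
}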
Use of io.trino.orc.OrcColumn in project trino by trinodb.
The class IcebergPageSourceProvider, method setMissingFieldIds.
private static OrcColumn setMissingFieldIds(OrcColumn column, NameMapping nameMapping, List<String> qualifiedPath) {
    MappedField mappedField = nameMapping.find(qualifiedPath);

    ImmutableMap.Builder<String, String> attributes = ImmutableMap.<String, String>builder()
            .putAll(column.getAttributes());
    if (mappedField != null && mappedField.id() != null) {
        attributes.put(ORC_ICEBERG_ID_KEY, String.valueOf(mappedField.id()));
    }

    return new OrcColumn(
            column.getPath(),
            column.getColumnId(),
            column.getColumnName(),
            column.getColumnType(),
            column.getOrcDataSourceId(),
            column.getNestedColumns().stream()
                    .map(nestedColumn -> {
                        ImmutableList.Builder<String> nextQualifiedPath = ImmutableList.<String>builder()
                                .addAll(qualifiedPath);
                        if (column.getColumnType().equals(OrcType.OrcTypeKind.LIST)) {
                            // The Trino ORC reader uses "item" for list element names, but the NameMapper expects "element"
                            nextQualifiedPath.add("element");
                        }
                        else {
                            nextQualifiedPath.add(nestedColumn.getColumnName());
                        }
                        return setMissingFieldIds(nestedColumn, nameMapping, nextQualifiedPath.build());
                    })
                    .collect(toImmutableList()),
            attributes.buildOrThrow());
}
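The NameMapping consulted here is Iceberg's name-to-field-id mapping, typically stored in the table property schema.name-mapping.default. A hedged sketch of driving the recursion for one top-level column, assuming a hypothetical mapping JSON and access to setMissingFieldIds (NameMappingParser is Iceberg's parser for this format; the applyNameMapping method is invented for illustration):

import com.google.common.collect.ImmutableList;
import org.apache.iceberg.mapping.NameMapping;
import org.apache.iceberg.mapping.NameMappingParser;

static OrcColumn applyNameMapping(OrcColumn fileColumn) {
    // Hypothetical mapping: id 1 for "id"; id 2 for "tags", whose list elements get id 3
    String mappingJson = "[{\"field-id\": 1, \"names\": [\"id\"]},"
            + " {\"field-id\": 2, \"names\": [\"tags\"],"
            + " \"fields\": [{\"field-id\": 3, \"names\": [\"element\"]}]}]";
    NameMapping mapping = NameMappingParser.fromJson(mappingJson);

    // Seed the recursion with the top-level column name, as createOrcPageSource does below
    return setMissingFieldIds(fileColumn, mapping, ImmutableList.of(fileColumn.getColumnName()));
}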
Use of io.trino.orc.OrcColumn in project trino by trinodb.
The class IcebergPageSourceProvider, method createOrcPageSource.
private static ReaderPageSource createOrcPageSource(
        HdfsEnvironment hdfsEnvironment,
        ConnectorIdentity identity,
        Configuration configuration,
        Path path,
        long start,
        long length,
        long fileSize,
        List<IcebergColumnHandle> columns,
        TupleDomain<IcebergColumnHandle> effectivePredicate,
        OrcReaderOptions options,
        FileFormatDataSourceStats stats,
        TypeManager typeManager,
        Optional<NameMapping> nameMapping) {
    OrcDataSource orcDataSource = null;
    try {
        FileSystem fileSystem = hdfsEnvironment.getFileSystem(identity, path, configuration);
        FSDataInputStream inputStream = hdfsEnvironment.doAs(identity, () -> fileSystem.open(path));
        orcDataSource = new HdfsOrcDataSource(new OrcDataSourceId(path.toString()), fileSize, options, inputStream, stats);

        OrcReader reader = OrcReader.createOrcReader(orcDataSource, options)
                .orElseThrow(() -> new TrinoException(ICEBERG_BAD_DATA, "ORC file is zero length"));
        List<OrcColumn> fileColumns = reader.getRootColumn().getNestedColumns();

        // Files written without Iceberg field ids fall back to the name mapping
        if (nameMapping.isPresent() && !hasIds(reader.getRootColumn())) {
            fileColumns = fileColumns.stream()
                    .map(orcColumn -> setMissingFieldIds(orcColumn, nameMapping.get(), ImmutableList.of(orcColumn.getColumnName())))
                    .collect(toImmutableList());
        }

        Map<Integer, OrcColumn> fileColumnsByIcebergId = mapIdsToOrcFileColumns(fileColumns);

        TupleDomainOrcPredicateBuilder predicateBuilder = TupleDomainOrcPredicate.builder()
                .setBloomFiltersEnabled(options.isBloomFiltersEnabled());
        Map<IcebergColumnHandle, Domain> effectivePredicateDomains = effectivePredicate.getDomains()
                .orElseThrow(() -> new IllegalArgumentException("Effective predicate is none"));

        Optional<ReaderColumns> columnProjections = projectColumns(columns);
        Map<Integer, List<List<Integer>>> projectionsByFieldId = columns.stream()
                .collect(groupingBy(
                        column -> column.getBaseColumnIdentity().getId(),
                        mapping(IcebergColumnHandle::getPath, toUnmodifiableList())));

        List<IcebergColumnHandle> readColumns = columnProjections
                .map(readerColumns -> (List<IcebergColumnHandle>) readerColumns.get().stream()
                        .map(IcebergColumnHandle.class::cast)
                        .collect(toImmutableList()))
                .orElse(columns);

        List<OrcColumn> fileReadColumns = new ArrayList<>(readColumns.size());
        List<Type> fileReadTypes = new ArrayList<>(readColumns.size());
        List<ProjectedLayout> projectedLayouts = new ArrayList<>(readColumns.size());
        List<ColumnAdaptation> columnAdaptations = new ArrayList<>(readColumns.size());

        for (IcebergColumnHandle column : readColumns) {
            verify(column.isBaseColumn(), "Column projections must be based from a root column");
            OrcColumn orcColumn = fileColumnsByIcebergId.get(column.getId());
            if (orcColumn != null) {
                Type readType = getOrcReadType(column.getType(), typeManager);

                if (column.getType() == UUID && !"UUID".equals(orcColumn.getAttributes().get(ICEBERG_BINARY_TYPE))) {
                    throw new TrinoException(ICEBERG_BAD_DATA, format(
                            "Expected ORC column for UUID data to be annotated with %s=UUID: %s",
                            ICEBERG_BINARY_TYPE, orcColumn));
                }

                List<List<Integer>> fieldIdProjections = projectionsByFieldId.get(column.getId());
                ProjectedLayout projectedLayout = IcebergOrcProjectedLayout.createProjectedLayout(orcColumn, fieldIdProjections);

                int sourceIndex = fileReadColumns.size();
                columnAdaptations.add(ColumnAdaptation.sourceColumn(sourceIndex));
                fileReadColumns.add(orcColumn);
                fileReadTypes.add(readType);
                projectedLayouts.add(projectedLayout);

                // Push down predicate domains that target this base column
                for (Map.Entry<IcebergColumnHandle, Domain> domainEntry : effectivePredicateDomains.entrySet()) {
                    IcebergColumnHandle predicateColumn = domainEntry.getKey();
                    OrcColumn predicateOrcColumn = fileColumnsByIcebergId.get(predicateColumn.getId());
                    if (predicateOrcColumn != null && column.getColumnIdentity().equals(predicateColumn.getBaseColumnIdentity())) {
                        predicateBuilder.addColumn(predicateOrcColumn.getColumnId(), domainEntry.getValue());
                    }
                }
            }
            else {
                // Column is absent from the file: surface it as nulls
                columnAdaptations.add(ColumnAdaptation.nullColumn(column.getType()));
            }
        }

        AggregatedMemoryContext memoryUsage = newSimpleAggregatedMemoryContext();
        OrcDataSourceId orcDataSourceId = orcDataSource.getId();
        OrcRecordReader recordReader = reader.createRecordReader(
                fileReadColumns,
                fileReadTypes,
                projectedLayouts,
                predicateBuilder.build(),
                start,
                length,
                UTC,
                memoryUsage,
                INITIAL_BATCH_SIZE,
                exception -> handleException(orcDataSourceId, exception),
                new IdBasedFieldMapperFactory(readColumns));

        return new ReaderPageSource(
                new OrcPageSource(recordReader, columnAdaptations, orcDataSource, Optional.empty(), Optional.empty(), memoryUsage, stats),
                columnProjections);
    }
    catch (Exception e) {
        if (orcDataSource != null) {
            try {
                orcDataSource.close();
            }
            catch (IOException ignored) {
            }
        }
        if (e instanceof TrinoException) {
            throw (TrinoException) e;
        }
        String message = format("Error opening Iceberg split %s (offset=%s, length=%s): %s", path, start, length, e.getMessage());
        if (e instanceof BlockMissingException) {
            throw new TrinoException(ICEBERG_MISSING_DATA, message, e);
        }
        throw new TrinoException(ICEBERG_CANNOT_OPEN_SPLIT, message, e);
    }
}
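This method leans on two helpers the excerpt does not show. Based on how they are called above, these are plausible sketches, not the verbatim Trino implementations: hasIds reports whether the file already carries Iceberg field ids, and mapIdsToOrcFileColumns indexes the whole column tree by field id.

import com.google.common.collect.ImmutableMap;
import com.google.common.graph.Traverser;

// Sketch: true if any column in the tree carries the Iceberg field-id attribute
private static boolean hasIds(OrcColumn column) {
    if (column.getAttributes().containsKey(ORC_ICEBERG_ID_KEY)) {
        return true;
    }
    return column.getNestedColumns().stream().anyMatch(IcebergPageSourceProvider::hasIds);
}

// Sketch: depth-first walk of the column tree, indexing each column by its field id
private static Map<Integer, OrcColumn> mapIdsToOrcFileColumns(List<OrcColumn> columns) {
    ImmutableMap.Builder<Integer, OrcColumn> columnsById = ImmutableMap.builder();
    Traverser.forTree(OrcColumn::getNestedColumns)
            .depthFirstPreOrder(columns)
            .forEach(column -> {
                String fieldId = column.getAttributes().get(ORC_ICEBERG_ID_KEY);
                if (fieldId != null) {
                    columnsById.put(Integer.parseInt(fieldId), column);
                }
            });
    return columnsById.buildOrThrow();
}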
Use of io.trino.orc.OrcColumn in project trino by trinodb.
The class RaptorStorageManager, method getPageSource.
@Override
public ConnectorPageSource getPageSource(
        UUID shardUuid,
        OptionalInt bucketNumber,
        List<Long> columnIds,
        List<Type> columnTypes,
        TupleDomain<RaptorColumnHandle> effectivePredicate,
        OrcReaderOptions orcReaderOptions,
        OptionalLong transactionId) {
    orcReaderOptions = orcReaderOptions.withMaxReadBlockSize(HUGE_MAX_READ_BLOCK_SIZE);
    OrcDataSource dataSource = openShard(shardUuid, orcReaderOptions);
    AggregatedMemoryContext memoryUsage = newSimpleAggregatedMemoryContext();
    try {
        OrcReader reader = OrcReader.createOrcReader(dataSource, orcReaderOptions)
                .orElseThrow(() -> new TrinoException(RAPTOR_ERROR, "Data file is empty for shard " + shardUuid));
        Map<Long, OrcColumn> indexMap = columnIdIndex(reader.getRootColumn().getNestedColumns());

        List<OrcColumn> fileReadColumn = new ArrayList<>(columnIds.size());
        List<Type> fileReadTypes = new ArrayList<>(columnIds.size());
        List<ColumnAdaptation> columnAdaptations = new ArrayList<>(columnIds.size());
        for (int i = 0; i < columnIds.size(); i++) {
            long columnId = columnIds.get(i);
            if (isHiddenColumn(columnId)) {
                columnAdaptations.add(specialColumnAdaptation(columnId, shardUuid, bucketNumber));
                continue;
            }
            Type type = toOrcFileType(columnTypes.get(i), typeManager);
            OrcColumn fileColumn = indexMap.get(columnId);
            if (fileColumn == null) {
                // Column is missing from this shard's file: surface it as nulls
                columnAdaptations.add(ColumnAdaptation.nullColumn(type));
            }
            else {
                int sourceIndex = fileReadColumn.size();
                columnAdaptations.add(ColumnAdaptation.sourceColumn(sourceIndex));
                fileReadColumn.add(fileColumn);
                fileReadTypes.add(type);
            }
        }

        OrcPredicate predicate = getPredicate(effectivePredicate, indexMap);
        OrcRecordReader recordReader = reader.createRecordReader(
                fileReadColumn,
                fileReadTypes,
                predicate,
                UTC,
                memoryUsage,
                INITIAL_BATCH_SIZE,
                RaptorPageSource::handleException);

        Optional<ShardRewriter> shardRewriter = Optional.empty();
        if (transactionId.isPresent()) {
            shardRewriter = Optional.of(createShardRewriter(transactionId.getAsLong(), bucketNumber, shardUuid));
        }
        return new RaptorPageSource(shardRewriter, recordReader, columnAdaptations, dataSource, memoryUsage);
    }
    catch (IOException | RuntimeException e) {
        closeQuietly(dataSource);
        throw new TrinoException(RAPTOR_ERROR, "Failed to create page source for shard " + shardUuid, e);
    }
    catch (Throwable t) {
        closeQuietly(dataSource);
        throw t;
    }
}
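The columnIdIndex helper is not shown in the excerpt. Raptor names each ORC column after its numeric Raptor column id, so a plausible sketch (an assumption, not the verbatim implementation) is a unique index keyed by parsing the column names:

import static com.google.common.collect.Maps.uniqueIndex;

// Sketch: Raptor stores the column id as the ORC column name
private static Map<Long, OrcColumn> columnIdIndex(List<OrcColumn> columns) {
    return uniqueIndex(columns, column -> Long.valueOf(column.getColumnName()));
}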