Use of io.trino.parquet.RichColumnDescriptor in project trino by trinodb.
Example: the class ParquetPageSourceFactory, method createPageSource.
/**
 * This method is available for other callers to use directly.
 */
public static ReaderPageSource createPageSource(
        Path path,
        long start,
        long length,
        long estimatedFileSize,
        List<HiveColumnHandle> columns,
        TupleDomain<HiveColumnHandle> effectivePredicate,
        boolean useColumnNames,
        HdfsEnvironment hdfsEnvironment,
        Configuration configuration,
        ConnectorIdentity identity,
        DateTimeZone timeZone,
        FileFormatDataSourceStats stats,
        ParquetReaderOptions options)
{
    // Ignore predicates on partial columns for now.
    effectivePredicate = effectivePredicate.filter((column, domain) -> column.isBaseColumn());

    MessageType fileSchema;
    MessageType requestedSchema;
    MessageColumnIO messageColumn;
    ParquetReader parquetReader;
    ParquetDataSource dataSource = null;
    try {
        FileSystem fileSystem = hdfsEnvironment.getFileSystem(identity, path, configuration);
        FSDataInputStream inputStream = hdfsEnvironment.doAs(identity, () -> fileSystem.open(path));
        dataSource = new HdfsParquetDataSource(new ParquetDataSourceId(path.toString()), estimatedFileSize, inputStream, stats, options);

        ParquetMetadata parquetMetadata = MetadataReader.readFooter(dataSource);
        FileMetaData fileMetaData = parquetMetadata.getFileMetaData();
        fileSchema = fileMetaData.getSchema();

        Optional<MessageType> message = projectSufficientColumns(columns)
                .map(projection -> projection.get().stream()
                        .map(HiveColumnHandle.class::cast)
                        .collect(toUnmodifiableList()))
                .orElse(columns)
                .stream()
                .filter(column -> column.getColumnType() == REGULAR)
                .map(column -> getColumnType(column, fileSchema, useColumnNames))
                .filter(Optional::isPresent)
                .map(Optional::get)
                .map(type -> new MessageType(fileSchema.getName(), type))
                .reduce(MessageType::union);

        requestedSchema = message.orElse(new MessageType(fileSchema.getName(), ImmutableList.of()));
        messageColumn = getColumnIO(fileSchema, requestedSchema);

        Map<List<String>, RichColumnDescriptor> descriptorsByPath = getDescriptors(fileSchema, requestedSchema);
        TupleDomain<ColumnDescriptor> parquetTupleDomain = options.isIgnoreStatistics()
                ? TupleDomain.all()
                : getParquetTupleDomain(descriptorsByPath, effectivePredicate, fileSchema, useColumnNames);

        Predicate parquetPredicate = buildPredicate(requestedSchema, parquetTupleDomain, descriptorsByPath, timeZone);

        long nextStart = 0;
        ImmutableList.Builder<BlockMetaData> blocks = ImmutableList.builder();
        ImmutableList.Builder<Long> blockStarts = ImmutableList.builder();
        ImmutableList.Builder<Optional<ColumnIndexStore>> columnIndexes = ImmutableList.builder();
        for (BlockMetaData block : parquetMetadata.getBlocks()) {
            long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset();
            Optional<ColumnIndexStore> columnIndex = getColumnIndexStore(dataSource, block, descriptorsByPath, parquetTupleDomain, options);
            if (start <= firstDataPage && firstDataPage < start + length
                    && predicateMatches(parquetPredicate, block, dataSource, descriptorsByPath, parquetTupleDomain, columnIndex)) {
                blocks.add(block);
                blockStarts.add(nextStart);
                columnIndexes.add(columnIndex);
            }
            nextStart += block.getRowCount();
        }

        parquetReader = new ParquetReader(
                Optional.ofNullable(fileMetaData.getCreatedBy()),
                messageColumn,
                blocks.build(),
                Optional.of(blockStarts.build()),
                dataSource,
                timeZone,
                newSimpleAggregatedMemoryContext(),
                options,
                parquetPredicate,
                columnIndexes.build());
    }
    catch (Exception e) {
        try {
            if (dataSource != null) {
                dataSource.close();
            }
        }
        catch (IOException ignored) {
        }
        if (e instanceof TrinoException) {
            throw (TrinoException) e;
        }
        if (e instanceof ParquetCorruptionException) {
            throw new TrinoException(HIVE_BAD_DATA, e);
        }
        if (nullToEmpty(e.getMessage()).trim().equals("Filesystem closed") || e instanceof FileNotFoundException) {
            throw new TrinoException(HIVE_CANNOT_OPEN_SPLIT, e);
        }
        String message = format("Error opening Hive split %s (offset=%s, length=%s): %s", path, start, length, e.getMessage());
        if (e instanceof BlockMissingException) {
            throw new TrinoException(HIVE_MISSING_DATA, message, e);
        }
        throw new TrinoException(HIVE_CANNOT_OPEN_SPLIT, message, e);
    }

    Optional<ReaderColumns> readerProjections = projectBaseColumns(columns);
    List<HiveColumnHandle> baseColumns = readerProjections.map(projection -> projection.get().stream()
                    .map(HiveColumnHandle.class::cast)
                    .collect(toUnmodifiableList()))
            .orElse(columns);

    for (HiveColumnHandle column : baseColumns) {
        checkArgument(column == PARQUET_ROW_INDEX_COLUMN || column.getColumnType() == REGULAR, "column type must be REGULAR: %s", column);
    }

    ImmutableList.Builder<Type> trinoTypes = ImmutableList.builder();
    ImmutableList.Builder<Optional<Field>> internalFields = ImmutableList.builder();
    ImmutableList.Builder<Boolean> rowIndexColumns = ImmutableList.builder();
    for (HiveColumnHandle column : baseColumns) {
        trinoTypes.add(column.getBaseType());
        rowIndexColumns.add(column == PARQUET_ROW_INDEX_COLUMN);
        if (column == PARQUET_ROW_INDEX_COLUMN) {
            internalFields.add(Optional.empty());
        }
        else {
            internalFields.add(Optional.ofNullable(getParquetType(column, fileSchema, useColumnNames))
                    .flatMap(field -> {
                        String columnName = useColumnNames ? column.getBaseColumnName() : fileSchema.getFields().get(column.getBaseHiveColumnIndex()).getName();
                        return constructField(column.getBaseType(), lookupColumnByName(messageColumn, columnName));
                    }));
        }
    }

    ConnectorPageSource parquetPageSource = new ParquetPageSource(parquetReader, trinoTypes.build(), rowIndexColumns.build(), internalFields.build());
    return new ReaderPageSource(parquetPageSource, readerProjections);
}
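For context, createPageSource keys every leaf of the requested schema by its column path and wraps it in a RichColumnDescriptor before doing predicate pushdown. Below is a minimal sketch of that descriptor map in isolation, assuming the getDescriptors helper used in the snippet comes from io.trino.parquet.ParquetTypeUtils and that parquet-mr is on the classpath; the schema and the column name order_id are made up for illustration.

import io.trino.parquet.RichColumnDescriptor;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.PrimitiveType;

import java.util.Arrays;
import java.util.List;
import java.util.Map;

import static io.trino.parquet.ParquetTypeUtils.getDescriptors;
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT64;
import static org.apache.parquet.schema.Type.Repetition.OPTIONAL;

public class DescriptorMapSketch
{
    public static void main(String[] args)
    {
        // Hypothetical single-column file schema; "order_id" is illustrative only
        MessageType fileSchema = new MessageType(
                "hive_schema",
                new PrimitiveType(OPTIONAL, INT64, "order_id"));

        // Same shape of map that createPageSource builds: the key is the list of
        // path elements, the value pairs the ColumnDescriptor with its PrimitiveType
        Map<List<String>, RichColumnDescriptor> descriptorsByPath = getDescriptors(fileSchema, fileSchema);
        descriptorsByPath.forEach((path, descriptor) ->
                System.out.println(path + " -> " + Arrays.toString(descriptor.getPath())));
    }
}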
Example: the class HiveParquetColumnIOConverter, method constructField.
public static Optional<Field> constructField(Type type, ColumnIO columnIO)
{
    if (columnIO == null) {
        return Optional.empty();
    }
    boolean required = columnIO.getType().getRepetition() != OPTIONAL;
    int repetitionLevel = columnRepetitionLevel(columnIO);
    int definitionLevel = columnDefinitionLevel(columnIO);

    if (type instanceof RowType) {
        RowType rowType = (RowType) type;
        GroupColumnIO groupColumnIO = (GroupColumnIO) columnIO;
        ImmutableList.Builder<Optional<Field>> fieldsBuilder = ImmutableList.builder();
        List<RowType.Field> fields = rowType.getFields();
        boolean structHasParameters = false;
        for (int i = 0; i < fields.size(); i++) {
            RowType.Field rowField = fields.get(i);
            String name = rowField.getName().orElseThrow().toLowerCase(Locale.ENGLISH);
            Optional<Field> field = constructField(rowField.getType(), lookupColumnByName(groupColumnIO, name));
            structHasParameters |= field.isPresent();
            fieldsBuilder.add(field);
        }
        if (structHasParameters) {
            return Optional.of(new GroupField(type, repetitionLevel, definitionLevel, required, fieldsBuilder.build()));
        }
        return Optional.empty();
    }

    if (type instanceof MapType) {
        MapType mapType = (MapType) type;
        GroupColumnIO groupColumnIO = (GroupColumnIO) columnIO;
        GroupColumnIO keyValueColumnIO = getMapKeyValueColumn(groupColumnIO);
        if (keyValueColumnIO.getChildrenCount() != 2) {
            return Optional.empty();
        }
        Optional<Field> keyField = constructField(mapType.getKeyType(), keyValueColumnIO.getChild(0));
        Optional<Field> valueField = constructField(mapType.getValueType(), keyValueColumnIO.getChild(1));
        return Optional.of(new GroupField(type, repetitionLevel, definitionLevel, required, ImmutableList.of(keyField, valueField)));
    }

    if (type instanceof ArrayType) {
        ArrayType arrayType = (ArrayType) type;
        GroupColumnIO groupColumnIO = (GroupColumnIO) columnIO;
        if (groupColumnIO.getChildrenCount() != 1) {
            return Optional.empty();
        }
        Optional<Field> field = constructField(arrayType.getElementType(), getArrayElementColumn(groupColumnIO.getChild(0)));
        return Optional.of(new GroupField(type, repetitionLevel, definitionLevel, required, ImmutableList.of(field)));
    }

    PrimitiveColumnIO primitiveColumnIO = (PrimitiveColumnIO) columnIO;
    RichColumnDescriptor column = new RichColumnDescriptor(primitiveColumnIO.getColumnDescriptor(), columnIO.getType().asPrimitiveType());
    return Optional.of(new PrimitiveField(type, repetitionLevel, definitionLevel, required, column, primitiveColumnIO.getId()));
}
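constructField walks the Trino type and the Parquet column I/O tree together; for a primitive column it falls through to the last branch, wrapping the leaf's ColumnDescriptor and PrimitiveType in a RichColumnDescriptor inside a PrimitiveField. The following is a minimal sketch of a top-level call, assuming getColumnIO and lookupColumnByName are the static helpers from io.trino.parquet.ParquetTypeUtils used in the previous snippet, and that constructField is importable from the Hive connector class above (the package path in the import is assumed); the schema and column name are illustrative.

import io.trino.parquet.Field;
import org.apache.parquet.io.MessageColumnIO;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.PrimitiveType;

import java.util.Optional;

import static io.trino.parquet.ParquetTypeUtils.getColumnIO;
import static io.trino.parquet.ParquetTypeUtils.lookupColumnByName;
import static io.trino.plugin.hive.parquet.HiveParquetColumnIOConverter.constructField; // package assumed
import static io.trino.spi.type.BigintType.BIGINT;
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT64;
import static org.apache.parquet.schema.Type.Repetition.OPTIONAL;

public class ConstructFieldSketch
{
    public static void main(String[] args)
    {
        // Hypothetical one-column schema; the requested schema equals the file schema here
        MessageType fileSchema = new MessageType(
                "hive_schema",
                new PrimitiveType(OPTIONAL, INT64, "my_bigint"));
        MessageColumnIO messageColumn = getColumnIO(fileSchema, fileSchema);

        // For a primitive leaf, the returned Field is a PrimitiveField carrying the
        // RichColumnDescriptor built in the last branch of constructField
        Optional<Field> field = constructField(BIGINT, lookupColumnByName(messageColumn, "my_bigint"));
        System.out.println(field.isPresent());
    }
}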
Example: the class TupleDomainParquetPredicate, method convertToParquetFilter.
private FilterPredicate convertToParquetFilter(DateTimeZone timeZone)
{
    FilterPredicate filter = null;
    for (RichColumnDescriptor column : columns) {
        Domain domain = effectivePredicate.getDomains().get().get(column);
        if (domain == null || domain.isNone()) {
            continue;
        }
        if (domain.isAll()) {
            continue;
        }
        FilterPredicate columnFilter = FilterApi.userDefined(
                new TrinoIntColumn(ColumnPath.get(column.getPath())),
                new DomainUserDefinedPredicate<>(column, domain, timeZone));
        if (filter == null) {
            filter = columnFilter;
        }
        else {
            filter = FilterApi.and(filter, columnFilter);
        }
    }
    return filter;
}
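The loop above is a null-seeded fold: each pushable column contributes one userDefined predicate, successive predicates are combined with FilterApi.and, and filter stays null when no column qualifies. Below is a standalone sketch of that accumulation pattern using plain parquet-mr FilterApi columns; the column names a and b are made up, and Trino's TrinoIntColumn and DomainUserDefinedPredicate are left out to keep it self-contained.

import org.apache.parquet.filter2.predicate.FilterApi;
import org.apache.parquet.filter2.predicate.FilterPredicate;

import java.util.List;

public class AndFoldSketch
{
    // Mirrors the accumulation in convertToParquetFilter: start from null,
    // AND in each column filter, and return null when nothing applies
    static FilterPredicate combine(List<FilterPredicate> columnFilters)
    {
        FilterPredicate filter = null;
        for (FilterPredicate columnFilter : columnFilters) {
            if (columnFilter == null) {
                continue;
            }
            filter = (filter == null) ? columnFilter : FilterApi.and(filter, columnFilter);
        }
        return filter;
    }

    public static void main(String[] args)
    {
        FilterPredicate eqA = FilterApi.eq(FilterApi.longColumn("a"), 1L);
        FilterPredicate gtB = FilterApi.gt(FilterApi.longColumn("b"), 0L);
        System.out.println(combine(List.of(eqA, gtB)));
    }
}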
Example: the class TestParquetPredicateUtils, method testParquetTupleDomainStructArray.
@Test
public void testParquetTupleDomainStructArray()
{
    RowType.Field rowField = new RowType.Field(Optional.of("a"), INTEGER);
    RowType rowType = RowType.from(ImmutableList.of(rowField));

    HiveColumnHandle columnHandle = createBaseColumn("my_array_struct", 0, HiveType.valueOf("array<struct<a:int>>"), rowType, REGULAR, Optional.empty());
    TupleDomain<HiveColumnHandle> domain = withColumnDomains(ImmutableMap.of(columnHandle, Domain.notNull(new ArrayType(rowType))));

    MessageType fileSchema = new MessageType("hive_schema",
            new GroupType(OPTIONAL, "my_array_struct",
                    new GroupType(REPEATED, "bag",
                            new GroupType(OPTIONAL, "array_element",
                                    new PrimitiveType(OPTIONAL, INT32, "a")))));

    Map<List<String>, RichColumnDescriptor> descriptorsByPath = getDescriptors(fileSchema, fileSchema);
    TupleDomain<ColumnDescriptor> tupleDomain = getParquetTupleDomain(descriptorsByPath, domain, fileSchema, true);
    assertTrue(tupleDomain.isAll());
}
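The assertion holds because getParquetTupleDomain only maps domains onto primitive leaf columns: the only leaf of the array<struct<a:int>> column sits several group levels deep, so the domain on the top-level handle has no matching descriptor and the resulting Parquet tuple domain stays all(). A minimal sketch that enumerates the leaf descriptors of the same schema, again assuming getDescriptors from io.trino.parquet.ParquetTypeUtils:

import io.trino.parquet.RichColumnDescriptor;
import org.apache.parquet.schema.GroupType;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.PrimitiveType;

import java.util.Arrays;
import java.util.List;
import java.util.Map;

import static io.trino.parquet.ParquetTypeUtils.getDescriptors;
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT32;
import static org.apache.parquet.schema.Type.Repetition.OPTIONAL;
import static org.apache.parquet.schema.Type.Repetition.REPEATED;

public class StructArrayLeafSketch
{
    public static void main(String[] args)
    {
        // Same nested layout as the test: array<struct<a:int>> in Hive's Parquet encoding
        MessageType fileSchema = new MessageType("hive_schema",
                new GroupType(OPTIONAL, "my_array_struct",
                        new GroupType(REPEATED, "bag",
                                new GroupType(OPTIONAL, "array_element",
                                        new PrimitiveType(OPTIONAL, INT32, "a")))));

        // Prints the single leaf path, several levels below the top-level column,
        // which is why no descriptor matches the column handle during pushdown
        Map<List<String>, RichColumnDescriptor> descriptorsByPath = getDescriptors(fileSchema, fileSchema);
        descriptorsByPath.forEach((path, descriptor) ->
                System.out.println(path + " -> " + Arrays.toString(descriptor.getPath())));
    }
}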
Example: the class TestParquetPredicateUtils, method testParquetTupleDomainPrimitive.
@Test
public void testParquetTupleDomainPrimitive()
{
    HiveColumnHandle columnHandle = createBaseColumn("my_primitive", 0, HiveType.valueOf("bigint"), BIGINT, REGULAR, Optional.empty());
    Domain singleValueDomain = Domain.singleValue(BIGINT, 123L);
    TupleDomain<HiveColumnHandle> domain = withColumnDomains(ImmutableMap.of(columnHandle, singleValueDomain));

    MessageType fileSchema = new MessageType("hive_schema", new PrimitiveType(OPTIONAL, INT64, "my_primitive"));

    Map<List<String>, RichColumnDescriptor> descriptorsByPath = getDescriptors(fileSchema, fileSchema);
    TupleDomain<ColumnDescriptor> tupleDomain = getParquetTupleDomain(descriptorsByPath, domain, fileSchema, true);

    assertEquals(tupleDomain.getDomains().get().size(), 1);
    ColumnDescriptor descriptor = tupleDomain.getDomains().get().keySet().iterator().next();
    assertEquals(descriptor.getPath().length, 1);
    assertEquals(descriptor.getPath()[0], "my_primitive");

    Domain predicateDomain = Iterables.getOnlyElement(tupleDomain.getDomains().get().values());
    assertEquals(predicateDomain, singleValueDomain);
}
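For a primitive column the domain passes through unchanged, keyed by the descriptor whose single-element path matches the column name. As a hedged variation, the same Trino SPI types can express a range predicate rather than a single value; the sketch below only builds the domain, and the threshold 100 plus the expectation that it would push down the same way are illustrative, not part of the original test.

import io.trino.spi.predicate.Domain;
import io.trino.spi.predicate.Range;
import io.trino.spi.predicate.ValueSet;

import static io.trino.spi.type.BigintType.BIGINT;

public class PrimitiveDomainSketch
{
    public static void main(String[] args)
    {
        // Hypothetical variation on Domain.singleValue(BIGINT, 123L):
        // a one-sided range (my_primitive > 100), nulls not allowed
        Domain rangeDomain = Domain.create(ValueSet.ofRanges(Range.greaterThan(BIGINT, 100L)), false);
        System.out.println(rangeDomain);
    }
}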