Use of org.apache.parquet.column.ColumnDescriptor in project presto by prestodb.
From the class TestTupleDomainParquetPredicate, method getEffectivePredicate:

private TupleDomain<ColumnDescriptor> getEffectivePredicate(RichColumnDescriptor column, VarcharType type, Slice value) {
    ColumnDescriptor predicateColumn = new ColumnDescriptor(column.getPath(), column.getType(), 0, 0);
    Domain predicateDomain = singleValue(type, value);
    Map<ColumnDescriptor, Domain> predicateColumns = singletonMap(predicateColumn, predicateDomain);
    return withColumnDomains(predicateColumns);
}
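For orientation, here is a minimal sketch of how this helper might be invoked from a test. The column name, varchar length, and literal value are illustrative assumptions (the sketch also assumes the usual static imports, such as io.airlift.slice.Slices.utf8Slice and com.facebook.presto.common.type.VarcharType.createVarcharType):

// Hypothetical caller: builds a single-value predicate over a varchar column named "name".
RichColumnDescriptor column = new RichColumnDescriptor(
        new ColumnDescriptor(new String[] { "name" }, BINARY, 0, 0),
        new PrimitiveType(OPTIONAL, BINARY, "name"));
TupleDomain<ColumnDescriptor> predicate =
        getEffectivePredicate(column, createVarcharType(255), utf8Slice("value"));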
Use of org.apache.parquet.column.ColumnDescriptor in project presto by prestodb.
From the class TestTupleDomainParquetPredicate, method testBigint:

@Test
public void testBigint() throws ParquetCorruptionException {
    ColumnDescriptor columnDescriptor = createColumnDescriptor(INT64, "BigintColumn");
    // missing statistics leave the domain unconstrained
    assertEquals(getDomain(columnDescriptor, BIGINT, 0, null, ID), Domain.all(BIGINT));
    // min == max collapses to a single value
    assertEquals(getDomain(columnDescriptor, BIGINT, 10, longColumnStats(100L, 100L), ID), singleValue(BIGINT, 100L));
    // min < max yields the inclusive range [0, 100]
    assertEquals(getDomain(columnDescriptor, BIGINT, 10, longColumnStats(0L, 100L), ID), create(ValueSet.ofRanges(range(BIGINT, 0L, true, 100L, true)), false));
    // statistics reporting only nulls keep all values possible and allow null
    assertEquals(getDomain(columnDescriptor, BIGINT, 20, longOnlyNullsStats(10), ID), create(ValueSet.all(BIGINT), true));
    // fail on corrupted statistics (min > max)
    assertThatExceptionOfType(ParquetCorruptionException.class).isThrownBy(() -> getDomain(columnDescriptor, BIGINT, 10, longColumnStats(100L, 10L), ID)).withMessage("Corrupted statistics for column \"[] required int64 BigintColumn\" in Parquet file \"testFile\": [min: 100, max: 10, num_nulls: 0]");
}
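The ID constant and the longColumnStats helper referenced above belong to the test class itself. Their likely shape, inferred from the assertion message (which names the file "testFile") and from Parquet's statistics API, is sketched below; this is an assumption, not the original source:

// Inferred from the error message above, which names the data source "testFile".
private static final ParquetDataSourceId ID = new ParquetDataSourceId("testFile");

// Sketch: wraps a [min, max] pair in Parquet long statistics.
private static LongStatistics longColumnStats(long minimum, long maximum) {
    LongStatistics statistics = new LongStatistics();
    statistics.setMinMax(minimum, maximum);
    return statistics;
}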
Use of org.apache.parquet.column.ColumnDescriptor in project presto by prestodb.
From the class TestTupleDomainParquetPredicate, method testBigintMatchesWithStatistics:

@Test
public void testBigintMatchesWithStatistics() throws ParquetCorruptionException {
    RichColumnDescriptor column = new RichColumnDescriptor(new ColumnDescriptor(new String[] { "path" }, INT64, 0, 0), new PrimitiveType(OPTIONAL, INT64, "Test column"));
    TupleDomain<ColumnDescriptor> effectivePredicate = TupleDomain.withColumnDomains(ImmutableMap.of(column, Domain.create(ValueSet.of(BIGINT, 42L, 43L, 44L, 404L), false)));
    TupleDomainParquetPredicate parquetPredicate = new TupleDomainParquetPredicate(effectivePredicate, singletonList(column));
    // [32, 42] contains 42, so the block may match
    assertTrue(parquetPredicate.matches(2, ImmutableMap.of(column, longColumnStats(32, 42)), ID));
    // [30, 40] contains none of 42, 43, 44, 404, so the block is pruned
    assertFalse(parquetPredicate.matches(2, ImmutableMap.of(column, longColumnStats(30, 40)), ID));
    // [1024, 0x10000 + 42] also excludes every predicate value
    assertFalse(parquetPredicate.matches(2, ImmutableMap.of(column, longColumnStats(1024, 0x10000 + 42)), ID));
}
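As a side note, assuming Presto's standard predicate API, the Domain built here from ValueSet.of with nulls disallowed has a more compact equivalent:

// Equivalent construction (assumed helper: Domain.multipleValues from presto-common).
Domain domain = Domain.multipleValues(BIGINT, ImmutableList.of(42L, 43L, 44L, 404L));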
Use of org.apache.parquet.column.ColumnDescriptor in project presto by prestodb.
From the class TestTupleDomainParquetPredicate, method testBoolean:

@Test
public void testBoolean() throws ParquetCorruptionException {
    ColumnDescriptor columnDescriptor = createColumnDescriptor(PrimitiveTypeName.BOOLEAN, "BooleanColumn");
    assertEquals(getDomain(columnDescriptor, BOOLEAN, 0, null, ID), Domain.all(BOOLEAN));
    assertEquals(getDomain(columnDescriptor, BOOLEAN, 10, booleanColumnStats(true, true), ID), singleValue(BOOLEAN, true));
    assertEquals(getDomain(columnDescriptor, BOOLEAN, 10, booleanColumnStats(false, false), ID), singleValue(BOOLEAN, false));
    // statistics spanning both values carry no information, so the domain stays unconstrained
    assertEquals(getDomain(columnDescriptor, BOOLEAN, 20, booleanColumnStats(false, true), ID), Domain.all(BOOLEAN));
}
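The createColumnDescriptor and booleanColumnStats helpers also live in the test class. Plausible minimal shapes, inferred from the calls above and from the "[] required int64 BigintColumn" assertion message, are sketched here (assumptions, not the original code):

// Sketch: empty path plus a REQUIRED primitive, matching the "[] required int64 ..." rendering.
private static RichColumnDescriptor createColumnDescriptor(PrimitiveTypeName typeName, String columnName) {
    return new RichColumnDescriptor(
            new ColumnDescriptor(new String[] {}, typeName, 0, 0),
            new PrimitiveType(REQUIRED, typeName, columnName));
}

// Sketch: wraps a boolean [min, max] pair in Parquet statistics.
private static BooleanStatistics booleanColumnStats(boolean minimum, boolean maximum) {
    BooleanStatistics statistics = new BooleanStatistics();
    statistics.setMinMax(minimum, maximum);
    return statistics;
}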
Use of org.apache.parquet.column.ColumnDescriptor in project presto by prestodb.
From the class IcebergPageSourceProvider, method createParquetPageSource:

private static ConnectorPageSource createParquetPageSource(
        HdfsEnvironment hdfsEnvironment,
        String user,
        Configuration configuration,
        Path path,
        long start,
        long length,
        SchemaTableName tableName,
        List<IcebergColumnHandle> regularColumns,
        boolean useParquetColumnNames,
        DataSize maxReadBlockSize,
        boolean batchReaderEnabled,
        boolean verificationEnabled,
        TupleDomain<IcebergColumnHandle> effectivePredicate,
        FileFormatDataSourceStats fileFormatDataSourceStats,
        boolean columnIndexFilterEnabled) {
    AggregatedMemoryContext systemMemoryContext = newSimpleAggregatedMemoryContext();
    ParquetDataSource dataSource = null;
    try {
        ExtendedFileSystem fileSystem = hdfsEnvironment.getFileSystem(user, path, configuration);
        FileStatus fileStatus = fileSystem.getFileStatus(path);
        long fileSize = fileStatus.getLen();
        long modificationTime = fileStatus.getModificationTime();
        HiveFileContext hiveFileContext = new HiveFileContext(true, NO_CACHE_CONSTRAINTS, Optional.empty(), Optional.of(fileSize), modificationTime, false);
        FSDataInputStream inputStream = fileSystem.openFile(path, hiveFileContext);
        dataSource = buildHdfsParquetDataSource(inputStream, path, fileFormatDataSourceStats);
        ParquetMetadata parquetMetadata = MetadataReader.readFooter(dataSource, fileSize).getParquetMetadata();
        FileMetaData fileMetaData = parquetMetadata.getFileMetaData();
        MessageType fileSchema = fileMetaData.getSchema();
        // Mapping from Iceberg field ID to Parquet fields.
        Map<Integer, org.apache.parquet.schema.Type> parquetIdToField = fileSchema.getFields().stream()
                .filter(field -> field.getId() != null)
                .collect(toImmutableMap(field -> field.getId().intValue(), Function.identity()));
        List<org.apache.parquet.schema.Type> parquetFields = regularColumns.stream().map(column -> {
            if (parquetIdToField.isEmpty()) {
                // This is a migrated table (no field IDs in the file), so fall back to name-based resolution
                return getParquetTypeByName(column.getName(), fileSchema);
            }
            return parquetIdToField.get(column.getId());
        }).collect(toList());
        // TODO: support subfield pushdown
        MessageType requestedSchema = new MessageType(fileSchema.getName(), parquetFields.stream().filter(Objects::nonNull).collect(toImmutableList()));
        Map<List<String>, RichColumnDescriptor> descriptorsByPath = getDescriptors(fileSchema, requestedSchema);
        TupleDomain<ColumnDescriptor> parquetTupleDomain = getParquetTupleDomain(descriptorsByPath, effectivePredicate);
        Predicate parquetPredicate = buildPredicate(requestedSchema, parquetTupleDomain, descriptorsByPath);
        final ParquetDataSource finalDataSource = dataSource;
        // Keep only row groups that start inside this split and can match the predicate.
        List<BlockMetaData> blocks = new ArrayList<>();
        List<ColumnIndexStore> blockIndexStores = new ArrayList<>();
        for (BlockMetaData block : parquetMetadata.getBlocks()) {
            long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset();
            Optional<ColumnIndexStore> columnIndexStore = ColumnIndexFilterUtils.getColumnIndexStore(parquetPredicate, finalDataSource, block, descriptorsByPath, columnIndexFilterEnabled);
            if ((firstDataPage >= start) && (firstDataPage < (start + length)) && predicateMatches(parquetPredicate, block, dataSource, descriptorsByPath, parquetTupleDomain, columnIndexStore, columnIndexFilterEnabled)) {
                blocks.add(block);
                blockIndexStores.add(columnIndexStore.orElse(null));
            }
        }
        MessageColumnIO messageColumnIO = getColumnIO(fileSchema, requestedSchema);
        ParquetReader parquetReader = new ParquetReader(messageColumnIO, blocks, dataSource, systemMemoryContext, maxReadBlockSize, batchReaderEnabled, verificationEnabled, parquetPredicate, blockIndexStores, columnIndexFilterEnabled);
        ImmutableList.Builder<String> namesBuilder = ImmutableList.builder();
        ImmutableList.Builder<Type> prestoTypes = ImmutableList.builder();
        ImmutableList.Builder<Optional<Field>> internalFields = ImmutableList.builder();
        for (int columnIndex = 0; columnIndex < regularColumns.size(); columnIndex++) {
            IcebergColumnHandle column = regularColumns.get(columnIndex);
            namesBuilder.add(column.getName());
            org.apache.parquet.schema.Type parquetField = parquetFields.get(columnIndex);
            Type prestoType = column.getType();
            prestoTypes.add(prestoType);
            if (parquetField == null) {
                // The column is absent from the file schema; no field is constructed for it.
                internalFields.add(Optional.empty());
            } else {
                internalFields.add(constructField(column.getType(), messageColumnIO.getChild(parquetField.getName())));
            }
        }
        return new ParquetPageSource(parquetReader, prestoTypes.build(), internalFields.build(), namesBuilder.build(), new RuntimeStats());
    } catch (Exception e) {
        try {
            if (dataSource != null) {
                dataSource.close();
            }
        } catch (IOException ignored) {
        }
        if (e instanceof PrestoException) {
            throw (PrestoException) e;
        }
        String message = format("Error opening Iceberg split %s (offset=%s, length=%s): %s", path, start, length, e.getMessage());
        if (e instanceof ParquetCorruptionException) {
            throw new PrestoException(ICEBERG_BAD_DATA, message, e);
        }
        if (e instanceof BlockMissingException) {
            throw new PrestoException(ICEBERG_MISSING_DATA, message, e);
        }
        throw new PrestoException(ICEBERG_CANNOT_OPEN_SPLIT, message, e);
    }
}
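The getParquetTypeByName helper used for the migrated-table fallback is defined elsewhere in Presto (ParquetTypeUtils). A simplified sketch of its behavior is shown below; the case-insensitive scan is an assumption based on Hive lowercasing column names, not a copy of the source:

// Simplified sketch of the name-based lookup: resolves a top-level field in the
// file schema by name, returning null when the column is absent.
private static org.apache.parquet.schema.Type getParquetTypeByName(String columnName, MessageType schema) {
    if (schema.containsField(columnName)) {
        return schema.getType(columnName);
    }
    // Hypothetical simplification: fall back to a case-insensitive scan.
    for (org.apache.parquet.schema.Type field : schema.getFields()) {
        if (field.getName().equalsIgnoreCase(columnName)) {
            return field;
        }
    }
    return null;
}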