Use of com.facebook.presto.parquet.cache.ParquetMetadataSource in project presto by prestodb.
In class AbstractTestParquetReader, method testCaching.
@Test
public void testCaching() throws Exception
{
    Cache<ParquetDataSourceId, ParquetFileMetadata> parquetFileMetadataCache = CacheBuilder.newBuilder()
            .maximumWeight(new DataSize(1, MEGABYTE).toBytes())
            .weigher((id, metadata) -> ((ParquetFileMetadata) metadata).getMetadataSize())
            .expireAfterAccess(new Duration(10, MINUTES).toMillis(), MILLISECONDS)
            .recordStats()
            .build();
    ParquetMetadataSource parquetMetadataSource = new CachingParquetMetadataSource(parquetFileMetadataCache, new MetadataReader());

    try (ParquetTester.TempFile tempFile = new ParquetTester.TempFile("test", "parquet")) {
        Iterable<Integer> values = intsBetween(0, 10);
        Iterator<?>[] readValues = stream(new Iterable<?>[] {values})
                .map(Iterable::iterator)
                .toArray(size -> new Iterator<?>[size]);
        List<String> columnNames = singletonList("column1");
        List<Type> columnTypes = singletonList(INTEGER);
        writeParquetFileFromPresto(tempFile.getFile(), columnTypes, columnNames, readValues, 10, CompressionCodecName.GZIP);

        // First read populates the cache: one miss, no hits
        testSingleRead(new Iterable<?>[] {values}, columnNames, columnTypes, parquetMetadataSource, tempFile.getFile());
        assertEquals(parquetFileMetadataCache.stats().missCount(), 1);
        assertEquals(parquetFileMetadataCache.stats().hitCount(), 0);

        // Re-reading the same file is served from the cache
        testSingleRead(new Iterable<?>[] {values}, columnNames, columnTypes, parquetMetadataSource, tempFile.getFile());
        assertEquals(parquetFileMetadataCache.stats().missCount(), 1);
        assertEquals(parquetFileMetadataCache.stats().hitCount(), 1);

        testSingleRead(new Iterable<?>[] {values}, columnNames, columnTypes, parquetMetadataSource, tempFile.getFile());
        assertEquals(parquetFileMetadataCache.stats().missCount(), 1);
        assertEquals(parquetFileMetadataCache.stats().hitCount(), 2);

        // Invalidating the cache forces the next read to miss again
        parquetFileMetadataCache.invalidateAll();
        testSingleRead(new Iterable<?>[] {values}, columnNames, columnTypes, parquetMetadataSource, tempFile.getFile());
        assertEquals(parquetFileMetadataCache.stats().missCount(), 2);
        assertEquals(parquetFileMetadataCache.stats().hitCount(), 2);

        testSingleRead(new Iterable<?>[] {values}, columnNames, columnTypes, parquetMetadataSource, tempFile.getFile());
        assertEquals(parquetFileMetadataCache.stats().missCount(), 2);
        assertEquals(parquetFileMetadataCache.stats().hitCount(), 3);
    }
}
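The test leans on Guava's weight-bounded cache and its hit/miss statistics. The standalone sketch below reproduces the same cache configuration (size budget via a weigher, access-based expiry, recorded stats) outside of Presto; the String key, byte[] footer payload, and 1 MB budget are illustrative stand-ins for the Presto types used above, not the real API.

import com.google.common.cache.Cache;
import com.google.common.cache.CacheBuilder;

import java.util.concurrent.ExecutionException;
import java.util.concurrent.TimeUnit;

public class MetadataCacheSketch
{
    public static void main(String[] args) throws ExecutionException
    {
        // Weight-bounded cache: each entry costs its footer size in bytes,
        // entries expire 10 minutes after the last access, and stats are recorded,
        // mirroring the configuration in testCaching above.
        Cache<String, byte[]> cache = CacheBuilder.newBuilder()
                .maximumWeight(1024 * 1024)
                .weigher((id, footer) -> ((byte[]) footer).length)
                .expireAfterAccess(10, TimeUnit.MINUTES)
                .recordStats()
                .build();

        // The first lookup misses and runs the loader; the second is served from the cache.
        cache.get("part-00000.parquet", () -> new byte[128]);
        cache.get("part-00000.parquet", () -> new byte[128]);
        System.out.println("misses=" + cache.stats().missCount()); // 1
        System.out.println("hits=" + cache.stats().hitCount());    // 1

        // Invalidation forces the next lookup to miss again, as asserted in the test.
        cache.invalidateAll();
        cache.get("part-00000.parquet", () -> new byte[128]);
        System.out.println("misses=" + cache.stats().missCount()); // 2
    }
}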
Use of com.facebook.presto.parquet.cache.ParquetMetadataSource in project presto by prestodb.
In class HiveClientModule, method createParquetMetadataSource.
@Singleton
@Provides
public ParquetMetadataSource createParquetMetadataSource(ParquetCacheConfig parquetCacheConfig, MBeanExporter exporter)
{
    ParquetMetadataSource parquetMetadataSource = new MetadataReader();
    if (parquetCacheConfig.isMetadataCacheEnabled()) {
        Cache<ParquetDataSourceId, ParquetFileMetadata> cache = CacheBuilder.newBuilder()
                .maximumWeight(parquetCacheConfig.getMetadataCacheSize().toBytes())
                .weigher((id, metadata) -> ((ParquetFileMetadata) metadata).getMetadataSize())
                .expireAfterAccess(parquetCacheConfig.getMetadataCacheTtlSinceLastAccess().toMillis(), MILLISECONDS)
                .recordStats()
                .build();
        CacheStatsMBean cacheStatsMBean = new CacheStatsMBean(cache);
        // Decorate the plain MetadataReader with the cache and expose the cache statistics via JMX
        parquetMetadataSource = new CachingParquetMetadataSource(cache, parquetMetadataSource);
        exporter.export(generatedNameOf(CacheStatsMBean.class, connectorId + "_ParquetMetadata"), cacheStatsMBean);
    }
    return parquetMetadataSource;
}
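Inside the if-branch the cache is placed in front of the plain MetadataReader by decoration: hits are answered from the cache, misses fall through to the wrapped source. Below is a minimal, self-contained sketch of that decorator shape under simplified, hypothetical types (FooterSource, Footer, a String file id, and a cacheable flag handled by bypassing the cache); it is not the actual CachingParquetMetadataSource implementation.

import com.google.common.cache.Cache;
import com.google.common.util.concurrent.UncheckedExecutionException;

import java.io.IOException;
import java.util.concurrent.ExecutionException;

// Hypothetical stand-ins for ParquetMetadataSource and ParquetFileMetadata, used only to show the decorator shape.
interface FooterSource
{
    Footer readFooter(String fileId, boolean cacheable) throws IOException;
}

class Footer
{
    final byte[] bytes;

    Footer(byte[] bytes)
    {
        this.bytes = bytes;
    }
}

class CachingFooterSource implements FooterSource
{
    private final Cache<String, Footer> cache;
    private final FooterSource delegate;

    CachingFooterSource(Cache<String, Footer> cache, FooterSource delegate)
    {
        this.cache = cache;
        this.delegate = delegate;
    }

    @Override
    public Footer readFooter(String fileId, boolean cacheable) throws IOException
    {
        if (!cacheable) {
            // Assumption in this sketch: skip the cache entirely for files that are not safe to cache
            return delegate.readFooter(fileId, false);
        }
        try {
            // Cache hit returns the stored footer; a miss runs the loader and stores the result
            return cache.get(fileId, () -> delegate.readFooter(fileId, true));
        }
        catch (ExecutionException | UncheckedExecutionException e) {
            Throwable cause = e.getCause();
            if (cause instanceof IOException) {
                throw (IOException) cause;
            }
            throw new IOException(cause);
        }
    }
}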
Use of com.facebook.presto.parquet.cache.ParquetMetadataSource in project presto by prestodb.
In class ParquetPageSourceFactory, method createParquetPageSource.
public static ConnectorPageSource createParquetPageSource(
        HdfsEnvironment hdfsEnvironment,
        String user,
        Configuration configuration,
        Path path,
        long start,
        long length,
        long fileSize,
        List<HiveColumnHandle> columns,
        SchemaTableName tableName,
        boolean useParquetColumnNames,
        DataSize maxReadBlockSize,
        boolean batchReaderEnabled,
        boolean verificationEnabled,
        TypeManager typeManager,
        StandardFunctionResolution functionResolution,
        TupleDomain<HiveColumnHandle> effectivePredicate,
        FileFormatDataSourceStats stats,
        HiveFileContext hiveFileContext,
        ParquetMetadataSource parquetMetadataSource,
        boolean columnIndexFilterEnabled)
{
    AggregatedMemoryContext systemMemoryContext = newSimpleAggregatedMemoryContext();
    ParquetDataSource dataSource = null;
    try {
        FSDataInputStream inputStream = hdfsEnvironment.getFileSystem(user, path, configuration).openFile(path, hiveFileContext);
        dataSource = buildHdfsParquetDataSource(inputStream, path, stats);
        // The footer comes from the (possibly caching) ParquetMetadataSource instead of being re-read from the file
        ParquetMetadata parquetMetadata = parquetMetadataSource.getParquetMetadata(dataSource, fileSize, hiveFileContext.isCacheable()).getParquetMetadata();

        if (!columns.isEmpty() && columns.stream().allMatch(hiveColumnHandle -> hiveColumnHandle.getColumnType() == AGGREGATED)) {
            return new AggregatedParquetPageSource(columns, parquetMetadata, typeManager, functionResolution);
        }

        FileMetaData fileMetaData = parquetMetadata.getFileMetaData();
        MessageType fileSchema = fileMetaData.getSchema();

        // Prune the file schema down to the requested columns (including pushed-down subfields)
        Optional<MessageType> message = columns.stream()
                .filter(column -> column.getColumnType() == REGULAR || isPushedDownSubfield(column))
                .map(column -> getColumnType(typeManager.getType(column.getTypeSignature()), fileSchema, useParquetColumnNames, column, tableName, path))
                .filter(Optional::isPresent)
                .map(Optional::get)
                .map(type -> new MessageType(fileSchema.getName(), type))
                .reduce(MessageType::union);
        MessageType requestedSchema = message.orElse(new MessageType(fileSchema.getName(), ImmutableList.of()));

        // A row group belongs to this split if its first data page starts within [start, start + length)
        ImmutableList.Builder<BlockMetaData> footerBlocks = ImmutableList.builder();
        for (BlockMetaData block : parquetMetadata.getBlocks()) {
            long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset();
            if (firstDataPage >= start && firstDataPage < start + length) {
                footerBlocks.add(block);
            }
        }

        Map<List<String>, RichColumnDescriptor> descriptorsByPath = getDescriptors(fileSchema, requestedSchema);
        TupleDomain<ColumnDescriptor> parquetTupleDomain = getParquetTupleDomain(descriptorsByPath, effectivePredicate);
        Predicate parquetPredicate = buildPredicate(requestedSchema, parquetTupleDomain, descriptorsByPath);
        final ParquetDataSource finalDataSource = dataSource;

        // Keep only the row groups whose statistics (and, if enabled, column indexes) can match the predicate
        ImmutableList.Builder<BlockMetaData> blocks = ImmutableList.builder();
        List<ColumnIndexStore> blockIndexStores = new ArrayList<>();
        for (BlockMetaData block : footerBlocks.build()) {
            Optional<ColumnIndexStore> columnIndexStore = ColumnIndexFilterUtils.getColumnIndexStore(parquetPredicate, finalDataSource, block, descriptorsByPath, columnIndexFilterEnabled);
            if (predicateMatches(parquetPredicate, block, finalDataSource, descriptorsByPath, parquetTupleDomain, columnIndexStore, columnIndexFilterEnabled)) {
                blocks.add(block);
                blockIndexStores.add(columnIndexStore.orElse(null));
                hiveFileContext.incrementCounter("parquet.blocksRead", 1);
                hiveFileContext.incrementCounter("parquet.rowsRead", block.getRowCount());
                hiveFileContext.incrementCounter("parquet.totalBytesRead", block.getTotalByteSize());
            }
            else {
                hiveFileContext.incrementCounter("parquet.blocksSkipped", 1);
                hiveFileContext.incrementCounter("parquet.rowsSkipped", block.getRowCount());
                hiveFileContext.incrementCounter("parquet.totalBytesSkipped", block.getTotalByteSize());
            }
        }

        MessageColumnIO messageColumnIO = getColumnIO(fileSchema, requestedSchema);
        ParquetReader parquetReader = new ParquetReader(messageColumnIO, blocks.build(), dataSource, systemMemoryContext, maxReadBlockSize, batchReaderEnabled, verificationEnabled, parquetPredicate, blockIndexStores, columnIndexFilterEnabled);

        // Resolve each requested Hive column to a Parquet field (or empty if the column is absent from the file)
        ImmutableList.Builder<String> namesBuilder = ImmutableList.builder();
        ImmutableList.Builder<Type> typesBuilder = ImmutableList.builder();
        ImmutableList.Builder<Optional<Field>> fieldsBuilder = ImmutableList.builder();
        for (HiveColumnHandle column : columns) {
            checkArgument(column.getColumnType() == REGULAR || column.getColumnType() == SYNTHESIZED, "column type must be regular or synthesized column");
            String name = column.getName();
            Type type = typeManager.getType(column.getTypeSignature());
            namesBuilder.add(name);
            typesBuilder.add(type);
            if (column.getColumnType() == SYNTHESIZED) {
                Subfield pushedDownSubfield = getPushedDownSubfield(column);
                List<String> nestedColumnPath = nestedColumnPath(pushedDownSubfield);
                Optional<ColumnIO> columnIO = findNestedColumnIO(lookupColumnByName(messageColumnIO, pushedDownSubfield.getRootName()), nestedColumnPath);
                if (columnIO.isPresent()) {
                    fieldsBuilder.add(constructField(type, columnIO.get()));
                }
                else {
                    fieldsBuilder.add(Optional.empty());
                }
            }
            else if (getParquetType(type, fileSchema, useParquetColumnNames, column, tableName, path).isPresent()) {
                String columnName = useParquetColumnNames ? name : fileSchema.getFields().get(column.getHiveColumnIndex()).getName();
                fieldsBuilder.add(constructField(type, lookupColumnByName(messageColumnIO, columnName)));
            }
            else {
                fieldsBuilder.add(Optional.empty());
            }
        }
        return new ParquetPageSource(parquetReader, typesBuilder.build(), fieldsBuilder.build(), namesBuilder.build(), hiveFileContext.getStats());
    }
    catch (Exception e) {
        try {
            if (dataSource != null) {
                dataSource.close();
            }
        }
        catch (IOException ignored) {
        }
        // Translate common failure modes into the appropriate PrestoException error codes
        if (e instanceof PrestoException) {
            throw (PrestoException) e;
        }
        if (e instanceof ParquetCorruptionException) {
            throw new PrestoException(HIVE_BAD_DATA, e);
        }
        if (e instanceof AccessControlException) {
            throw new PrestoException(PERMISSION_DENIED, e.getMessage(), e);
        }
        if (nullToEmpty(e.getMessage()).trim().equals("Filesystem closed") || e instanceof FileNotFoundException) {
            throw new PrestoException(HIVE_CANNOT_OPEN_SPLIT, e);
        }
        String message = format("Error opening Hive split %s (offset=%s, length=%s): %s", path, start, length, e.getMessage());
        if (e.getClass().getSimpleName().equals("BlockMissingException")) {
            throw new PrestoException(HIVE_MISSING_DATA, message, e);
        }
        throw new PrestoException(HIVE_CANNOT_OPEN_SPLIT, message, e);
    }
}
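The row-group selection loop above applies a simple ownership rule: a block is read by the split whose byte range [start, start + length) contains the offset of the block's first data page, so two splits that overlap a row group never both read it. The standalone sketch below isolates that rule; the List<Long> of first-data-page offsets is a hypothetical stand-in for parquet-mr's BlockMetaData.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class SplitBlockSelection
{
    // A row group is owned by the split whose byte range contains the first data page
    // of its first column, so each group is assigned to exactly one split even when
    // split boundaries fall in the middle of a group.
    static List<Long> ownedBlockOffsets(List<Long> firstDataPageOffsets, long start, long length)
    {
        List<Long> owned = new ArrayList<>();
        for (long offset : firstDataPageOffsets) {
            if (offset >= start && offset < start + length) {
                owned.add(offset);
            }
        }
        return owned;
    }

    public static void main(String[] args)
    {
        List<Long> offsets = Arrays.asList(4L, 1_000L, 2_000L);
        System.out.println(ownedBlockOffsets(offsets, 0, 1_500));     // [4, 1000]
        System.out.println(ownedBlockOffsets(offsets, 1_500, 1_500)); // [2000]
    }
}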