Use of com.facebook.presto.common.RuntimeStats in project presto by prestodb.
The class TestOrcReaderDwrfStripeCaching, method getDwrfStripeCache.
private Optional<DwrfStripeCache> getDwrfStripeCache(File orcFile) throws IOException {
    CapturingStripeMetadataSourceFactory stripeMetadataSourceFactory = new CapturingStripeMetadataSourceFactory();
    OrcDataSource orcDataSource = new FileOrcDataSource(orcFile, new DataSize(1, MEGABYTE), new DataSize(1, MEGABYTE), new DataSize(1, MEGABYTE), true);
    // Constructing the OrcReader reads the file tail; the capturing factory records the DwrfStripeCache as a side effect, so the reader instance itself is discarded.
    new OrcReader(orcDataSource, DWRF, new StorageOrcFileTailSource(READ_TAIL_SIZE_IN_BYTES, true), stripeMetadataSourceFactory, NOOP_ORC_AGGREGATED_MEMORY_CONTEXT, OrcReaderTestingUtils.createDefaultTestConfig(), false, NO_ENCRYPTION, DwrfKeyProvider.EMPTY, new RuntimeStats());
    return stripeMetadataSourceFactory.getDwrfStripeCache();
}
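The helper above passes a fresh RuntimeStats to the reader and never inspects it. If the collected metrics are of interest, the instance can be kept and read after construction. A minimal sketch, assuming RuntimeStats exposes getMetrics() returning named RuntimeMetric entries (verify against the RuntimeStats API of the Presto version in use):

RuntimeStats runtimeStats = new RuntimeStats();
new OrcReader(orcDataSource, DWRF, new StorageOrcFileTailSource(READ_TAIL_SIZE_IN_BYTES, true), stripeMetadataSourceFactory, NOOP_ORC_AGGREGATED_MEMORY_CONTEXT, OrcReaderTestingUtils.createDefaultTestConfig(), false, NO_ENCRYPTION, DwrfKeyProvider.EMPTY, runtimeStats);
// Assumed API: getMetrics() returns Map<String, RuntimeMetric>; the metric names and units depend on the reader version.
runtimeStats.getMetrics().forEach((name, metric) -> System.out.println(name + " = " + metric.getSum()));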
Use of com.facebook.presto.common.RuntimeStats in project presto by prestodb.
The class TestStructBatchStreamReader, method read.
private RowBlock read(TempFile tempFile, Type readerType) throws IOException {
    DataSize dataSize = new DataSize(1, MEGABYTE);
    OrcDataSource orcDataSource = new FileOrcDataSource(tempFile.getFile(), dataSize, dataSize, dataSize, true);
    OrcReader orcReader = new OrcReader(orcDataSource, ORC, new StorageOrcFileTailSource(), new StorageStripeMetadataSource(), NOOP_ORC_AGGREGATED_MEMORY_CONTEXT, new OrcReaderOptions(dataSize, dataSize, dataSize, false), false, NO_ENCRYPTION, DwrfKeyProvider.EMPTY, new RuntimeStats());
    // Only column 0 is projected; the caller-supplied reader type controls how the struct column is decoded.
    Map<Integer, Type> includedColumns = new HashMap<>();
    includedColumns.put(0, readerType);
    OrcBatchRecordReader recordReader = orcReader.createBatchRecordReader(includedColumns, OrcPredicate.TRUE, UTC, new TestingHiveOrcAggregatedMemoryContext(), OrcReader.INITIAL_BATCH_SIZE);
    // A single nextBatch() call is enough for the small test fixtures read here.
    recordReader.nextBatch();
    RowBlock block = (RowBlock) recordReader.readBlock(0);
    recordReader.close();
    return block;
}
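This helper reads only the first batch, which is sufficient for the single-batch fixtures in this test. For larger files the usual pattern is to loop until nextBatch() stops returning rows, as the getRowsFromUuid example further down does; a short sketch (variable names are illustrative):

List<Block> allBlocks = new ArrayList<>();
while (recordReader.nextBatch() > 0) {
    // readBlock(0) returns the decoded block for column 0 of the current batch
    allBlocks.add(recordReader.readBlock(0));
}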
Use of com.facebook.presto.common.RuntimeStats in project presto by prestodb.
The class DeltaPageSourceProvider, method createParquetPageSource.
private static ConnectorPageSource createParquetPageSource(HdfsEnvironment hdfsEnvironment, String user, Configuration configuration, Path path, long start, long length, long fileSize, List<DeltaColumnHandle> columns, SchemaTableName tableName, DataSize maxReadBlockSize, boolean batchReaderEnabled, boolean verificationEnabled, TypeManager typeManager, TupleDomain<DeltaColumnHandle> effectivePredicate, FileFormatDataSourceStats stats, boolean columnIndexFilterEnabled) {
    AggregatedMemoryContext systemMemoryContext = newSimpleAggregatedMemoryContext();
    ParquetDataSource dataSource = null;
    try {
        FSDataInputStream inputStream = hdfsEnvironment.getFileSystem(user, path, configuration).open(path);
        dataSource = buildHdfsParquetDataSource(inputStream, path, stats);
        ParquetMetadata parquetMetadata = MetadataReader.readFooter(dataSource, fileSize).getParquetMetadata();
        FileMetaData fileMetaData = parquetMetadata.getFileMetaData();
        MessageType fileSchema = fileMetaData.getSchema();
        // Build the requested schema from the regular and pushed-down subfield columns that exist in the file.
        Optional<MessageType> message = columns.stream()
                .filter(column -> column.getColumnType() == REGULAR || isPushedDownSubfield(column))
                .map(column -> getColumnType(typeManager.getType(column.getDataType()), fileSchema, column, tableName, path))
                .filter(Optional::isPresent)
                .map(Optional::get)
                .map(type -> new MessageType(fileSchema.getName(), type))
                .reduce(MessageType::union);
        MessageType requestedSchema = message.orElse(new MessageType(fileSchema.getName(), ImmutableList.of()));
        // Keep only the row groups whose first data page falls inside this split.
        ImmutableList.Builder<BlockMetaData> footerBlocks = ImmutableList.builder();
        for (BlockMetaData block : parquetMetadata.getBlocks()) {
            long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset();
            if (firstDataPage >= start && firstDataPage < start + length) {
                footerBlocks.add(block);
            }
        }
        Map<List<String>, RichColumnDescriptor> descriptorsByPath = getDescriptors(fileSchema, requestedSchema);
        TupleDomain<ColumnDescriptor> parquetTupleDomain = getParquetTupleDomain(descriptorsByPath, effectivePredicate);
        Predicate parquetPredicate = buildPredicate(requestedSchema, parquetTupleDomain, descriptorsByPath);
        final ParquetDataSource finalDataSource = dataSource;
        // Drop row groups that cannot match the predicate, optionally consulting the Parquet column index.
        ImmutableList.Builder<BlockMetaData> blocks = ImmutableList.builder();
        List<ColumnIndexStore> blockIndexStores = new ArrayList<>();
        for (BlockMetaData block : footerBlocks.build()) {
            Optional<ColumnIndexStore> columnIndexStore = ColumnIndexFilterUtils.getColumnIndexStore(parquetPredicate, finalDataSource, block, descriptorsByPath, columnIndexFilterEnabled);
            if (predicateMatches(parquetPredicate, block, finalDataSource, descriptorsByPath, parquetTupleDomain, columnIndexStore, columnIndexFilterEnabled)) {
                blocks.add(block);
                blockIndexStores.add(columnIndexStore.orElse(null));
            }
        }
        MessageColumnIO messageColumnIO = getColumnIO(fileSchema, requestedSchema);
        ParquetReader parquetReader = new ParquetReader(messageColumnIO, blocks.build(), dataSource, systemMemoryContext, maxReadBlockSize, batchReaderEnabled, verificationEnabled, parquetPredicate, blockIndexStores, columnIndexFilterEnabled);
        // Resolve each column handle to a Parquet field; columns missing from the file become empty fields and are read as nulls.
        ImmutableList.Builder<String> namesBuilder = ImmutableList.builder();
        ImmutableList.Builder<Type> typesBuilder = ImmutableList.builder();
        ImmutableList.Builder<Optional<Field>> fieldsBuilder = ImmutableList.builder();
        for (DeltaColumnHandle column : columns) {
            checkArgument(column.getColumnType() == REGULAR || column.getColumnType() == SUBFIELD, "column type must be regular or subfield column");
            String name = column.getName();
            Type type = typeManager.getType(column.getDataType());
            namesBuilder.add(name);
            typesBuilder.add(type);
            if (isPushedDownSubfield(column)) {
                Subfield pushedDownSubfield = getPushedDownSubfield(column);
                List<String> nestedColumnPath = nestedColumnPath(pushedDownSubfield);
                Optional<ColumnIO> columnIO = findNestedColumnIO(lookupColumnByName(messageColumnIO, pushedDownSubfield.getRootName()), nestedColumnPath);
                if (columnIO.isPresent()) {
                    fieldsBuilder.add(constructField(type, columnIO.get()));
                }
                else {
                    fieldsBuilder.add(Optional.empty());
                }
            }
            else if (getParquetType(type, fileSchema, column, tableName, path).isPresent()) {
                fieldsBuilder.add(constructField(type, lookupColumnByName(messageColumnIO, name)));
            }
            else {
                fieldsBuilder.add(Optional.empty());
            }
        }
        return new ParquetPageSource(parquetReader, typesBuilder.build(), fieldsBuilder.build(), namesBuilder.build(), new RuntimeStats());
    }
    catch (Exception exception) {
        try {
            if (dataSource != null) {
                dataSource.close();
            }
        }
        catch (IOException ignored) {
        }
        // Rethrow known failure modes with the appropriate Presto error codes.
        if (exception instanceof PrestoException) {
            throw (PrestoException) exception;
        }
        if (exception instanceof ParquetCorruptionException) {
            throw new PrestoException(DELTA_BAD_DATA, exception);
        }
        if (exception instanceof AccessControlException) {
            throw new PrestoException(PERMISSION_DENIED, exception.getMessage(), exception);
        }
        if (nullToEmpty(exception.getMessage()).trim().equals("Filesystem closed") || exception instanceof FileNotFoundException) {
            throw new PrestoException(DELTA_CANNOT_OPEN_SPLIT, exception);
        }
        String message = format("Error opening Hive split %s (offset=%s, length=%s): %s", path, start, length, exception.getMessage());
        if (exception.getClass().getSimpleName().equals("BlockMissingException")) {
            throw new PrestoException(DELTA_MISSING_DATA, message, exception);
        }
        throw new PrestoException(DELTA_CANNOT_OPEN_SPLIT, message, exception);
    }
}
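The method only builds the page source; rows are produced when the caller drains it. A hedged sketch of the standard ConnectorPageSource consumption loop (this helper is illustrative and not part of DeltaPageSourceProvider):

private static long countRows(ConnectorPageSource pageSource) throws IOException {
    long rows = 0;
    try (ConnectorPageSource source = pageSource) {
        while (!source.isFinished()) {
            Page page = source.getNextPage();
            if (page != null) {
                rows += page.getPositionCount();
            }
        }
    }
    return rows;
}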
Use of com.facebook.presto.common.RuntimeStats in project presto by prestodb.
The class OrcStorageManager, method getRowsFromUuid.
Optional<BitSet> getRowsFromUuid(FileSystem fileSystem, Optional<UUID> deltaShardUuid) {
    if (!deltaShardUuid.isPresent()) {
        return Optional.empty();
    }
    try (OrcDataSource dataSource = openShard(fileSystem, deltaShardUuid.get(), defaultReaderAttributes)) {
        OrcAggregatedMemoryContext systemMemoryUsage = new RaptorOrcAggregatedMemoryContext();
        OrcReader reader = new OrcReader(dataSource, ORC, orcFileTailSource, new StorageStripeMetadataSource(), new RaptorOrcAggregatedMemoryContext(), new OrcReaderOptions(defaultReaderAttributes.getMaxMergeDistance(), defaultReaderAttributes.getTinyStripeThreshold(), HUGE_MAX_READ_BLOCK_SIZE, defaultReaderAttributes.isZstdJniDecompressionEnabled()), false, NO_ENCRYPTION, DwrfKeyProvider.EMPTY, new RuntimeStats());
        // Row numbers are packed into a BitSet indexed by int, so the file must stay below Integer.MAX_VALUE rows.
        if (reader.getFooter().getNumberOfRows() >= Integer.MAX_VALUE) {
            throw new IOException("File has too many rows");
        }
        // The delta shard exposes a single BIGINT column of row numbers; collect them into the BitSet.
        try (OrcBatchRecordReader recordReader = reader.createBatchRecordReader(ImmutableMap.of(0, BIGINT), OrcPredicate.TRUE, DEFAULT_STORAGE_TIMEZONE, systemMemoryUsage, INITIAL_BATCH_SIZE)) {
            BitSet bitSet = new BitSet();
            while (recordReader.nextBatch() > 0) {
                Block block = recordReader.readBlock(0);
                for (int i = 0; i < block.getPositionCount(); i++) {
                    bitSet.set(toIntExact(block.getLong(i)));
                }
            }
            return Optional.of(bitSet);
        }
    }
    catch (IOException | RuntimeException e) {
        throw new PrestoException(RAPTOR_ERROR, "Failed to read file: " + deltaShardUuid, e);
    }
}
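getRowsFromUuid returns the row numbers recorded in a delta (delete) shard as a BitSet. A hedged sketch of how such a mask could be consulted while scanning the base shard (rowNumber is an illustrative variable, not taken from OrcStorageManager):

Optional<BitSet> deletedRows = getRowsFromUuid(fileSystem, deltaShardUuid);
// A row is filtered out if its row number appears in the delta shard's bitmap.
boolean rowIsDeleted = deletedRows.map(bits -> bits.get(rowNumber)).orElse(false);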
Use of com.facebook.presto.common.RuntimeStats in project presto by prestodb.
The class OrcStorageManager, method computeShardStats.
private List<ColumnStats> computeShardStats(FileSystem fileSystem, Path file) {
    try (OrcDataSource dataSource = orcDataEnvironment.createOrcDataSource(fileSystem, file, defaultReaderAttributes)) {
        OrcReader reader = new OrcReader(dataSource, ORC, orcFileTailSource, stripeMetadataSourceFactory, new RaptorOrcAggregatedMemoryContext(), new OrcReaderOptions(defaultReaderAttributes.getMaxMergeDistance(), defaultReaderAttributes.getTinyStripeThreshold(), HUGE_MAX_READ_BLOCK_SIZE, defaultReaderAttributes.isZstdJniDecompressionEnabled()), false, NO_ENCRYPTION, DwrfKeyProvider.EMPTY, new RuntimeStats());
        ImmutableList.Builder<ColumnStats> list = ImmutableList.builder();
        for (ColumnInfo info : getColumnInfo(reader)) {
            computeColumnStats(reader, info.getColumnId(), info.getType(), typeManager).ifPresent(list::add);
        }
        return list.build();
    }
    catch (IOException e) {
        throw new PrestoException(RAPTOR_ERROR, "Failed to read file: " + file, e);
    }
}
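A hedged usage sketch: stats of this kind are typically computed right after a shard file has been written and then attached to the shard's metadata. The call site and stagingFile below are illustrative, and ColumnStats.getColumnId() is assumed to mirror the ColumnInfo accessor used above:

List<ColumnStats> columnStats = computeShardStats(fileSystem, stagingFile);
for (ColumnStats stats : columnStats) {
    // One entry per column that produced statistics; empty results are skipped by ifPresent above.
    System.out.println("computed stats for column " + stats.getColumnId());
}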