use of com.facebook.presto.hive.HiveColumnHandle.ColumnType.REGULAR in project presto by prestodb.
the class IcebergPageSourceProvider method getFileOrcColumns.
private static List<IcebergOrcColumn> getFileOrcColumns(OrcReader reader) {
List<OrcType> orcTypes = reader.getFooter().getTypes();
OrcType rootOrcType = orcTypes.get(ROOT_COLUMN_ID);
List<IcebergOrcColumn> columnAttributes = ImmutableList.of();
if (rootOrcType.getOrcTypeKind() == OrcType.OrcTypeKind.STRUCT) {
columnAttributes = IntStream.range(0, rootOrcType.getFieldCount()).mapToObj(fieldId -> new IcebergOrcColumn(fieldId, rootOrcType.getFieldTypeIndex(fieldId), // so we use 'Optional.empty()' temporarily.
Optional.empty(), rootOrcType.getFieldName(fieldId), REGULAR, orcTypes.get(rootOrcType.getFieldTypeIndex(fieldId)).getOrcTypeKind(), orcTypes.get(rootOrcType.getFieldTypeIndex(fieldId)).getAttributes())).collect(toImmutableList());
} else if (rootOrcType.getOrcTypeKind() == OrcType.OrcTypeKind.LIST) {
columnAttributes = ImmutableList.of(new IcebergOrcColumn(0, rootOrcType.getFieldTypeIndex(0), Optional.empty(), "item", REGULAR, orcTypes.get(rootOrcType.getFieldTypeIndex(0)).getOrcTypeKind(), orcTypes.get(rootOrcType.getFieldTypeIndex(0)).getAttributes()));
} else if (rootOrcType.getOrcTypeKind() == OrcType.OrcTypeKind.MAP) {
columnAttributes = ImmutableList.of(new IcebergOrcColumn(0, rootOrcType.getFieldTypeIndex(0), Optional.empty(), "key", REGULAR, orcTypes.get(rootOrcType.getFieldTypeIndex(0)).getOrcTypeKind(), orcTypes.get(rootOrcType.getFieldTypeIndex(0)).getAttributes()), new IcebergOrcColumn(1, rootOrcType.getFieldTypeIndex(1), Optional.empty(), "value", REGULAR, orcTypes.get(rootOrcType.getFieldTypeIndex(1)).getOrcTypeKind(), orcTypes.get(rootOrcType.getFieldTypeIndex(1)).getAttributes()));
} else if (rootOrcType.getOrcTypeKind() == OrcType.OrcTypeKind.UNION) {
columnAttributes = IntStream.range(0, rootOrcType.getFieldCount()).mapToObj(fieldId -> new IcebergOrcColumn(fieldId, rootOrcType.getFieldTypeIndex(fieldId), Optional.empty(), "field" + fieldId, REGULAR, orcTypes.get(rootOrcType.getFieldTypeIndex(fieldId)).getOrcTypeKind(), orcTypes.get(rootOrcType.getFieldTypeIndex(fieldId)).getAttributes())).collect(toImmutableList());
}
return columnAttributes;
}
use of com.facebook.presto.hive.HiveColumnHandle.ColumnType.REGULAR in project presto by prestodb.
the class ParquetHiveRecordCursor method createParquetRecordReader.
private ParquetRecordReader<FakeParquetRecord> createParquetRecordReader(HdfsEnvironment hdfsEnvironment, String sessionUser, Configuration configuration, Path path, long start, long length, List<HiveColumnHandle> columns, boolean useParquetColumnNames, TypeManager typeManager, boolean predicatePushdownEnabled, TupleDomain<HiveColumnHandle> effectivePredicate) {
ParquetDataSource dataSource = null;
try {
FileSystem fileSystem = hdfsEnvironment.getFileSystem(sessionUser, path, configuration);
dataSource = buildHdfsParquetDataSource(fileSystem, path, start, length);
ParquetMetadata parquetMetadata = hdfsEnvironment.doAs(sessionUser, () -> ParquetFileReader.readFooter(configuration, path, NO_FILTER));
List<BlockMetaData> blocks = parquetMetadata.getBlocks();
FileMetaData fileMetaData = parquetMetadata.getFileMetaData();
MessageType fileSchema = fileMetaData.getSchema();
PrestoReadSupport readSupport = new PrestoReadSupport(useParquetColumnNames, columns, fileSchema);
List<parquet.schema.Type> fields = columns.stream().filter(column -> column.getColumnType() == REGULAR).map(column -> getParquetType(column, fileSchema, useParquetColumnNames)).filter(Objects::nonNull).collect(toList());
MessageType requestedSchema = new MessageType(fileSchema.getName(), fields);
LongArrayList offsets = new LongArrayList(blocks.size());
for (BlockMetaData block : blocks) {
long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset();
if (firstDataPage >= start && firstDataPage < start + length) {
if (predicatePushdownEnabled) {
ParquetPredicate parquetPredicate = buildParquetPredicate(columns, effectivePredicate, fileMetaData.getSchema(), typeManager);
if (predicateMatches(parquetPredicate, block, dataSource, requestedSchema, effectivePredicate)) {
offsets.add(block.getStartingPos());
}
} else {
offsets.add(block.getStartingPos());
}
}
}
ParquetInputSplit split = new ParquetInputSplit(path, start, start + length, length, null, offsets.toLongArray());
TaskAttemptContext taskContext = ContextUtil.newTaskAttemptContext(configuration, new TaskAttemptID());
return hdfsEnvironment.doAs(sessionUser, () -> {
ParquetRecordReader<FakeParquetRecord> realReader = new PrestoParquetRecordReader(readSupport);
realReader.initialize(split, taskContext);
return realReader;
});
} catch (Exception e) {
Throwables.propagateIfInstanceOf(e, PrestoException.class);
if (e instanceof InterruptedException) {
Thread.currentThread().interrupt();
throw Throwables.propagate(e);
}
String message = format("Error opening Hive split %s (offset=%s, length=%s): %s", path, start, length, e.getMessage());
if (e.getClass().getSimpleName().equals("BlockMissingException")) {
throw new PrestoException(HIVE_MISSING_DATA, message, e);
}
throw new PrestoException(HIVE_CANNOT_OPEN_SPLIT, message, e);
} finally {
if (dataSource != null) {
try {
dataSource.close();
} catch (IOException ignored) {
}
}
}
}
use of com.facebook.presto.hive.HiveColumnHandle.ColumnType.REGULAR in project presto by prestodb.
the class ParquetPageSourceFactory method createParquetPageSource.
public static ParquetPageSource createParquetPageSource(HdfsEnvironment hdfsEnvironment, String user, Configuration configuration, Path path, long start, long length, Properties schema, List<HiveColumnHandle> columns, boolean useParquetColumnNames, TypeManager typeManager, boolean predicatePushdownEnabled, TupleDomain<HiveColumnHandle> effectivePredicate) {
AggregatedMemoryContext systemMemoryContext = new AggregatedMemoryContext();
ParquetDataSource dataSource = null;
try {
FileSystem fileSystem = hdfsEnvironment.getFileSystem(user, path, configuration);
dataSource = buildHdfsParquetDataSource(fileSystem, path, start, length);
ParquetMetadata parquetMetadata = ParquetMetadataReader.readFooter(fileSystem, path);
FileMetaData fileMetaData = parquetMetadata.getFileMetaData();
MessageType fileSchema = fileMetaData.getSchema();
List<parquet.schema.Type> fields = columns.stream().filter(column -> column.getColumnType() == REGULAR).map(column -> getParquetType(column, fileSchema, useParquetColumnNames)).filter(Objects::nonNull).collect(toList());
MessageType requestedSchema = new MessageType(fileSchema.getName(), fields);
List<BlockMetaData> blocks = new ArrayList<>();
for (BlockMetaData block : parquetMetadata.getBlocks()) {
long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset();
if (firstDataPage >= start && firstDataPage < start + length) {
blocks.add(block);
}
}
if (predicatePushdownEnabled) {
ParquetPredicate parquetPredicate = buildParquetPredicate(columns, effectivePredicate, fileMetaData.getSchema(), typeManager);
final ParquetDataSource finalDataSource = dataSource;
blocks = blocks.stream().filter(block -> predicateMatches(parquetPredicate, block, finalDataSource, requestedSchema, effectivePredicate)).collect(toList());
}
ParquetReader parquetReader = new ParquetReader(fileSchema, requestedSchema, blocks, dataSource, typeManager, systemMemoryContext);
return new ParquetPageSource(parquetReader, dataSource, fileSchema, requestedSchema, length, schema, columns, effectivePredicate, typeManager, useParquetColumnNames, systemMemoryContext);
} catch (Exception e) {
try {
if (dataSource != null) {
dataSource.close();
}
} catch (IOException ignored) {
}
if (e instanceof PrestoException) {
throw (PrestoException) e;
}
String message = format("Error opening Hive split %s (offset=%s, length=%s): %s", path, start, length, e.getMessage());
if (e.getClass().getSimpleName().equals("BlockMissingException")) {
throw new PrestoException(HIVE_MISSING_DATA, message, e);
}
throw new PrestoException(HIVE_CANNOT_OPEN_SPLIT, message, e);
}
}
use of com.facebook.presto.hive.HiveColumnHandle.ColumnType.REGULAR in project presto by prestodb.
the class HiveUtil method createRecordReader.
public static RecordReader<?, ?> createRecordReader(Configuration configuration, Path path, long start, long length, Properties schema, List<HiveColumnHandle> columns) {
// determine which hive columns we will read
List<HiveColumnHandle> readColumns = ImmutableList.copyOf(filter(columns, column -> column.getColumnType() == REGULAR));
List<Integer> readHiveColumnIndexes = ImmutableList.copyOf(transform(readColumns, HiveColumnHandle::getHiveColumnIndex));
// Tell hive the columns we would like to read, this lets hive optimize reading column oriented files
setReadColumns(configuration, readHiveColumnIndexes);
InputFormat<?, ?> inputFormat = getInputFormat(configuration, schema, true);
JobConf jobConf = new JobConf(configuration);
FileSplit fileSplit = new FileSplit(path, start, length, (String[]) null);
// propagate serialization configuration to getRecordReader
schema.stringPropertyNames().stream().filter(name -> name.startsWith("serialization.")).forEach(name -> jobConf.set(name, schema.getProperty(name)));
try {
return retry().stopOnIllegalExceptions().run("createRecordReader", () -> inputFormat.getRecordReader(fileSplit, jobConf, Reporter.NULL));
} catch (Exception e) {
throw new PrestoException(HIVE_CANNOT_OPEN_SPLIT, format("Error opening Hive split %s (offset=%s, length=%s) using %s: %s", path, start, length, getInputFormatName(schema), e.getMessage()), e);
}
}
use of com.facebook.presto.hive.HiveColumnHandle.ColumnType.REGULAR in project presto by prestodb.
the class ParquetPageSourceFactory method createParquetPageSource.
public static ConnectorPageSource createParquetPageSource(HdfsEnvironment hdfsEnvironment, String user, Configuration configuration, Path path, long start, long length, long fileSize, List<HiveColumnHandle> columns, SchemaTableName tableName, boolean useParquetColumnNames, DataSize maxReadBlockSize, boolean batchReaderEnabled, boolean verificationEnabled, TypeManager typeManager, StandardFunctionResolution functionResolution, TupleDomain<HiveColumnHandle> effectivePredicate, FileFormatDataSourceStats stats, HiveFileContext hiveFileContext, ParquetMetadataSource parquetMetadataSource, boolean columnIndexFilterEnabled) {
AggregatedMemoryContext systemMemoryContext = newSimpleAggregatedMemoryContext();
ParquetDataSource dataSource = null;
try {
FSDataInputStream inputStream = hdfsEnvironment.getFileSystem(user, path, configuration).openFile(path, hiveFileContext);
dataSource = buildHdfsParquetDataSource(inputStream, path, stats);
ParquetMetadata parquetMetadata = parquetMetadataSource.getParquetMetadata(dataSource, fileSize, hiveFileContext.isCacheable()).getParquetMetadata();
if (!columns.isEmpty() && columns.stream().allMatch(hiveColumnHandle -> hiveColumnHandle.getColumnType() == AGGREGATED)) {
return new AggregatedParquetPageSource(columns, parquetMetadata, typeManager, functionResolution);
}
FileMetaData fileMetaData = parquetMetadata.getFileMetaData();
MessageType fileSchema = fileMetaData.getSchema();
Optional<MessageType> message = columns.stream().filter(column -> column.getColumnType() == REGULAR || isPushedDownSubfield(column)).map(column -> getColumnType(typeManager.getType(column.getTypeSignature()), fileSchema, useParquetColumnNames, column, tableName, path)).filter(Optional::isPresent).map(Optional::get).map(type -> new MessageType(fileSchema.getName(), type)).reduce(MessageType::union);
MessageType requestedSchema = message.orElse(new MessageType(fileSchema.getName(), ImmutableList.of()));
ImmutableList.Builder<BlockMetaData> footerBlocks = ImmutableList.builder();
for (BlockMetaData block : parquetMetadata.getBlocks()) {
long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset();
if (firstDataPage >= start && firstDataPage < start + length) {
footerBlocks.add(block);
}
}
Map<List<String>, RichColumnDescriptor> descriptorsByPath = getDescriptors(fileSchema, requestedSchema);
TupleDomain<ColumnDescriptor> parquetTupleDomain = getParquetTupleDomain(descriptorsByPath, effectivePredicate);
Predicate parquetPredicate = buildPredicate(requestedSchema, parquetTupleDomain, descriptorsByPath);
final ParquetDataSource finalDataSource = dataSource;
ImmutableList.Builder<BlockMetaData> blocks = ImmutableList.builder();
List<ColumnIndexStore> blockIndexStores = new ArrayList<>();
for (BlockMetaData block : footerBlocks.build()) {
Optional<ColumnIndexStore> columnIndexStore = ColumnIndexFilterUtils.getColumnIndexStore(parquetPredicate, finalDataSource, block, descriptorsByPath, columnIndexFilterEnabled);
if (predicateMatches(parquetPredicate, block, finalDataSource, descriptorsByPath, parquetTupleDomain, columnIndexStore, columnIndexFilterEnabled)) {
blocks.add(block);
blockIndexStores.add(columnIndexStore.orElse(null));
hiveFileContext.incrementCounter("parquet.blocksRead", 1);
hiveFileContext.incrementCounter("parquet.rowsRead", block.getRowCount());
hiveFileContext.incrementCounter("parquet.totalBytesRead", block.getTotalByteSize());
} else {
hiveFileContext.incrementCounter("parquet.blocksSkipped", 1);
hiveFileContext.incrementCounter("parquet.rowsSkipped", block.getRowCount());
hiveFileContext.incrementCounter("parquet.totalBytesSkipped", block.getTotalByteSize());
}
}
MessageColumnIO messageColumnIO = getColumnIO(fileSchema, requestedSchema);
ParquetReader parquetReader = new ParquetReader(messageColumnIO, blocks.build(), dataSource, systemMemoryContext, maxReadBlockSize, batchReaderEnabled, verificationEnabled, parquetPredicate, blockIndexStores, columnIndexFilterEnabled);
ImmutableList.Builder<String> namesBuilder = ImmutableList.builder();
ImmutableList.Builder<Type> typesBuilder = ImmutableList.builder();
ImmutableList.Builder<Optional<Field>> fieldsBuilder = ImmutableList.builder();
for (HiveColumnHandle column : columns) {
checkArgument(column.getColumnType() == REGULAR || column.getColumnType() == SYNTHESIZED, "column type must be regular or synthesized column");
String name = column.getName();
Type type = typeManager.getType(column.getTypeSignature());
namesBuilder.add(name);
typesBuilder.add(type);
if (column.getColumnType() == SYNTHESIZED) {
Subfield pushedDownSubfield = getPushedDownSubfield(column);
List<String> nestedColumnPath = nestedColumnPath(pushedDownSubfield);
Optional<ColumnIO> columnIO = findNestedColumnIO(lookupColumnByName(messageColumnIO, pushedDownSubfield.getRootName()), nestedColumnPath);
if (columnIO.isPresent()) {
fieldsBuilder.add(constructField(type, columnIO.get()));
} else {
fieldsBuilder.add(Optional.empty());
}
} else if (getParquetType(type, fileSchema, useParquetColumnNames, column, tableName, path).isPresent()) {
String columnName = useParquetColumnNames ? name : fileSchema.getFields().get(column.getHiveColumnIndex()).getName();
fieldsBuilder.add(constructField(type, lookupColumnByName(messageColumnIO, columnName)));
} else {
fieldsBuilder.add(Optional.empty());
}
}
return new ParquetPageSource(parquetReader, typesBuilder.build(), fieldsBuilder.build(), namesBuilder.build(), hiveFileContext.getStats());
} catch (Exception e) {
try {
if (dataSource != null) {
dataSource.close();
}
} catch (IOException ignored) {
}
if (e instanceof PrestoException) {
throw (PrestoException) e;
}
if (e instanceof ParquetCorruptionException) {
throw new PrestoException(HIVE_BAD_DATA, e);
}
if (e instanceof AccessControlException) {
throw new PrestoException(PERMISSION_DENIED, e.getMessage(), e);
}
if (nullToEmpty(e.getMessage()).trim().equals("Filesystem closed") || e instanceof FileNotFoundException) {
throw new PrestoException(HIVE_CANNOT_OPEN_SPLIT, e);
}
String message = format("Error opening Hive split %s (offset=%s, length=%s): %s", path, start, length, e.getMessage());
if (e.getClass().getSimpleName().equals("BlockMissingException")) {
throw new PrestoException(HIVE_MISSING_DATA, message, e);
}
throw new PrestoException(HIVE_CANNOT_OPEN_SPLIT, message, e);
}
}
Aggregations