use of com.facebook.presto.hive.orc.HdfsOrcDataSource in project presto by prestodb.
the class SortingFileWriter method mergeFiles.
private void mergeFiles(Iterable<TempFile> files, Consumer<Page> consumer) {
try (Closer closer = Closer.create()) {
Collection<Iterator<Page>> iterators = new ArrayList<>();
for (TempFile tempFile : files) {
Path file = tempFile.getPath();
OrcDataSource dataSource = new HdfsOrcDataSource(new OrcDataSourceId(file.toString()), fileSystem.getFileStatus(file).getLen(), new DataSize(1, MEGABYTE), new DataSize(8, MEGABYTE), new DataSize(8, MEGABYTE), false, fileSystem.open(file), new FileFormatDataSourceStats());
closer.register(dataSource);
iterators.add(new TempFileReader(types, dataSource));
}
new MergingPageIterator(iterators, types, sortFields, sortOrders).forEachRemaining(consumer);
if (!sortedWriteToTempPathEnabled) {
for (TempFile tempFile : files) {
Path file = tempFile.getPath();
fileSystem.delete(file, false);
if (fileSystem.exists(file)) {
throw new IOException("Failed to delete temporary file: " + file);
}
}
}
} catch (IOException e) {
throw new UncheckedIOException(e);
}
}
use of com.facebook.presto.hive.orc.HdfsOrcDataSource in project presto by prestodb.
the class IcebergFileWriterFactory method createOrcWriter.
private IcebergFileWriter createOrcWriter(Path outputPath, Schema icebergSchema, JobConf jobConf, ConnectorSession session) {
try {
FileSystem fileSystem = hdfsEnvironment.getFileSystem(session.getUser(), outputPath, jobConf);
DataSink orcDataSink = hdfsEnvironment.doAs(session.getUser(), () -> new OutputStreamDataSink(fileSystem.create(outputPath)));
Callable<Void> rollbackAction = () -> {
hdfsEnvironment.doAs(session.getUser(), () -> fileSystem.delete(outputPath, false));
return null;
};
List<Types.NestedField> columnFields = icebergSchema.columns();
List<String> fileColumnNames = columnFields.stream().map(Types.NestedField::name).collect(toImmutableList());
List<Type> fileColumnTypes = columnFields.stream().map(Types.NestedField::type).map(type -> toPrestoType(type, typeManager)).collect(toImmutableList());
Optional<Supplier<OrcDataSource>> validationInputFactory = Optional.empty();
if (isOrcOptimizedWriterValidate(session)) {
validationInputFactory = Optional.of(() -> {
try {
return new HdfsOrcDataSource(new OrcDataSourceId(outputPath.toString()), hdfsEnvironment.doAs(session.getUser(), () -> fileSystem.getFileStatus(outputPath).getLen()), getOrcMaxMergeDistance(session), getOrcMaxBufferSize(session), getOrcStreamBufferSize(session), false, hdfsEnvironment.doAs(session.getUser(), () -> fileSystem.open(outputPath)), readStats);
} catch (IOException e) {
throw new PrestoException(ICEBERG_WRITE_VALIDATION_FAILED, e);
}
});
}
return new IcebergOrcFileWriter(icebergSchema, orcDataSink, rollbackAction, ORC, fileColumnNames, fileColumnTypes, toOrcType(icebergSchema), getCompressionCodec(session).getOrcCompressionKind(), orcFileWriterConfig.toOrcWriterOptionsBuilder().withFlushPolicy(DefaultOrcWriterFlushPolicy.builder().withStripeMinSize(HiveSessionProperties.getOrcOptimizedWriterMinStripeSize(session)).withStripeMaxSize(HiveSessionProperties.getOrcOptimizedWriterMaxStripeSize(session)).withStripeMaxRowCount(HiveSessionProperties.getOrcOptimizedWriterMaxStripeRows(session)).build()).withDictionaryMaxMemory(HiveSessionProperties.getOrcOptimizedWriterMaxDictionaryMemory(session)).withMaxStringStatisticsLimit(HiveSessionProperties.getOrcStringStatisticsLimit(session)).build(), IntStream.range(0, fileColumnNames.size()).toArray(), ImmutableMap.<String, String>builder().put(PRESTO_VERSION_NAME, nodeVersion.toString()).put(PRESTO_QUERY_ID_NAME, session.getQueryId()).build(), UTC, validationInputFactory, getOrcOptimizedWriterValidateMode(session), orcWriterStats, dwrfEncryptionProvider, Optional.empty());
} catch (IOException e) {
throw new PrestoException(ICEBERG_WRITER_OPEN_ERROR, "Error creating ORC file", e);
}
}
use of com.facebook.presto.hive.orc.HdfsOrcDataSource in project presto by prestodb.
the class IcebergPageSourceProvider method createBatchOrcPageSource.
private static ConnectorPageSource createBatchOrcPageSource(HdfsEnvironment hdfsEnvironment, String user, Configuration configuration, Path path, long start, long length, boolean isCacheable, List<IcebergColumnHandle> regularColumns, TypeManager typeManager, TupleDomain<IcebergColumnHandle> effectivePredicate, OrcReaderOptions options, OrcEncoding orcEncoding, DataSize maxBufferSize, DataSize streamBufferSize, boolean lazyReadSmallRanges, boolean orcBloomFiltersEnabled, int domainCompactionThreshold, OrcFileTailSource orcFileTailSource, StripeMetadataSourceFactory stripeMetadataSourceFactory, FileFormatDataSourceStats stats, Optional<EncryptionInformation> encryptionInformation, DwrfEncryptionProvider dwrfEncryptionProvider) {
OrcDataSource orcDataSource = null;
try {
ExtendedFileSystem fileSystem = hdfsEnvironment.getFileSystem(user, path, configuration);
FileStatus fileStatus = fileSystem.getFileStatus(path);
long fileSize = fileStatus.getLen();
long modificationTime = fileStatus.getModificationTime();
HiveFileContext hiveFileContext = new HiveFileContext(true, NO_CACHE_CONSTRAINTS, Optional.empty(), Optional.of(fileSize), modificationTime, false);
FSDataInputStream inputStream = hdfsEnvironment.doAs(user, () -> fileSystem.openFile(path, hiveFileContext));
orcDataSource = new HdfsOrcDataSource(new OrcDataSourceId(path.toString()), fileSize, options.getMaxMergeDistance(), maxBufferSize, streamBufferSize, lazyReadSmallRanges, inputStream, stats);
// Todo: pass real columns to ProjectionBasedDwrfKeyProvider instead of ImmutableList.of()
DwrfKeyProvider dwrfKeyProvider = new ProjectionBasedDwrfKeyProvider(encryptionInformation, ImmutableList.of(), true, path);
RuntimeStats runtimeStats = new RuntimeStats();
OrcReader reader = new OrcReader(orcDataSource, orcEncoding, orcFileTailSource, stripeMetadataSourceFactory, new HiveOrcAggregatedMemoryContext(), options, isCacheable, dwrfEncryptionProvider, dwrfKeyProvider, runtimeStats);
List<HiveColumnHandle> physicalColumnHandles = new ArrayList<>(regularColumns.size());
ImmutableMap.Builder<Integer, Type> includedColumns = ImmutableMap.builder();
ImmutableList.Builder<TupleDomainOrcPredicate.ColumnReference<HiveColumnHandle>> columnReferences = ImmutableList.builder();
List<IcebergOrcColumn> fileOrcColumns = getFileOrcColumns(reader);
Map<Integer, IcebergOrcColumn> fileOrcColumnByIcebergId = fileOrcColumns.stream().filter(orcColumn -> orcColumn.getAttributes().containsKey(ORC_ICEBERG_ID_KEY)).collect(toImmutableMap(orcColumn -> Integer.parseInt(orcColumn.getAttributes().get(ORC_ICEBERG_ID_KEY)), orcColumn -> IcebergOrcColumn.copy(orcColumn).setIcebergColumnId(Optional.of(Integer.parseInt(orcColumn.getAttributes().get(ORC_ICEBERG_ID_KEY))))));
Map<String, IcebergOrcColumn> fileOrcColumnsByName = uniqueIndex(fileOrcColumns, orcColumn -> orcColumn.getColumnName().toLowerCase(ENGLISH));
int nextMissingColumnIndex = fileOrcColumnsByName.size();
for (IcebergColumnHandle column : regularColumns) {
IcebergOrcColumn icebergOrcColumn;
boolean isExcludeColumn = false;
if (fileOrcColumnByIcebergId.isEmpty()) {
icebergOrcColumn = fileOrcColumnsByName.get(column.getName());
} else {
icebergOrcColumn = fileOrcColumnByIcebergId.get(column.getId());
if (icebergOrcColumn == null) {
// Cannot get orc column from 'fileOrcColumnByIcebergId', which means SchemaEvolution may have happened, so we get orc column by column name.
icebergOrcColumn = fileOrcColumnsByName.get(column.getName());
if (icebergOrcColumn != null) {
isExcludeColumn = true;
}
}
}
if (icebergOrcColumn != null) {
HiveColumnHandle columnHandle = new HiveColumnHandle(// Todo: using orc file column name
column.getName(), toHiveType(column.getType()), column.getType().getTypeSignature(), icebergOrcColumn.getOrcColumnId(), icebergOrcColumn.getColumnType(), Optional.empty(), Optional.empty());
physicalColumnHandles.add(columnHandle);
// Skip SchemaEvolution column
if (!isExcludeColumn) {
includedColumns.put(columnHandle.getHiveColumnIndex(), typeManager.getType(columnHandle.getTypeSignature()));
columnReferences.add(new TupleDomainOrcPredicate.ColumnReference<>(columnHandle, columnHandle.getHiveColumnIndex(), typeManager.getType(columnHandle.getTypeSignature())));
}
} else {
physicalColumnHandles.add(new HiveColumnHandle(column.getName(), toHiveType(column.getType()), column.getType().getTypeSignature(), nextMissingColumnIndex++, REGULAR, Optional.empty(), Optional.empty()));
}
}
TupleDomain<HiveColumnHandle> hiveColumnHandleTupleDomain = effectivePredicate.transform(column -> {
IcebergOrcColumn icebergOrcColumn;
if (fileOrcColumnByIcebergId.isEmpty()) {
icebergOrcColumn = fileOrcColumnsByName.get(column.getName());
} else {
icebergOrcColumn = fileOrcColumnByIcebergId.get(column.getId());
if (icebergOrcColumn == null) {
// Cannot get orc column from 'fileOrcColumnByIcebergId', which means SchemaEvolution may have happened, so we get orc column by column name.
icebergOrcColumn = fileOrcColumnsByName.get(column.getName());
}
}
return new HiveColumnHandle(column.getName(), toHiveType(column.getType()), column.getType().getTypeSignature(), // Note: the HiveColumnHandle.hiveColumnIndex starts from '0' while the IcebergColumnHandle.id starts from '1'
icebergOrcColumn != null ? icebergOrcColumn.getOrcColumnId() : column.getId() - 1, icebergOrcColumn != null ? icebergOrcColumn.getColumnType() : REGULAR, Optional.empty(), Optional.empty());
});
OrcPredicate predicate = new TupleDomainOrcPredicate<>(hiveColumnHandleTupleDomain, columnReferences.build(), orcBloomFiltersEnabled, Optional.of(domainCompactionThreshold));
OrcAggregatedMemoryContext systemMemoryUsage = new HiveOrcAggregatedMemoryContext();
OrcBatchRecordReader recordReader = reader.createBatchRecordReader(includedColumns.build(), predicate, start, length, UTC, systemMemoryUsage, INITIAL_BATCH_SIZE);
return new OrcBatchPageSource(recordReader, orcDataSource, physicalColumnHandles, typeManager, systemMemoryUsage, stats, runtimeStats);
} catch (Exception e) {
if (orcDataSource != null) {
try {
orcDataSource.close();
} catch (IOException ignored) {
}
}
if (e instanceof PrestoException) {
throw (PrestoException) e;
}
String message = format("Error opening Iceberg split %s (offset=%s, length=%s): %s", path, start, length, e.getMessage());
if (e instanceof BlockMissingException) {
throw new PrestoException(ICEBERG_MISSING_DATA, message, e);
}
throw new PrestoException(ICEBERG_CANNOT_OPEN_SPLIT, message, e);
}
}
use of com.facebook.presto.hive.orc.HdfsOrcDataSource in project presto by prestodb.
the class OrcFileWriterFactory method createFileWriter.
@Override
public Optional<HiveFileWriter> createFileWriter(Path path, List<String> inputColumnNames, StorageFormat storageFormat, Properties schema, JobConf configuration, ConnectorSession session, Optional<EncryptionInformation> encryptionInformation) {
if (!HiveSessionProperties.isOrcOptimizedWriterEnabled(session)) {
return Optional.empty();
}
OrcEncoding orcEncoding;
if (OrcOutputFormat.class.getName().equals(storageFormat.getOutputFormat())) {
orcEncoding = ORC;
} else if (com.facebook.hive.orc.OrcOutputFormat.class.getName().equals(storageFormat.getOutputFormat())) {
orcEncoding = DWRF;
} else {
return Optional.empty();
}
CompressionKind compression = getCompression(schema, configuration, orcEncoding);
// existing tables and partitions may have columns in a different order than the writer is providing, so build
// an index to rearrange columns in the proper order
List<String> fileColumnNames = Splitter.on(',').trimResults().omitEmptyStrings().splitToList(schema.getProperty(META_TABLE_COLUMNS, ""));
List<Type> fileColumnTypes = toHiveTypes(schema.getProperty(META_TABLE_COLUMN_TYPES, "")).stream().map(hiveType -> hiveType.getType(typeManager)).collect(toList());
int[] fileInputColumnIndexes = fileColumnNames.stream().mapToInt(inputColumnNames::indexOf).toArray();
try {
FileSystem fileSystem = hdfsEnvironment.getFileSystem(session.getUser(), path, configuration);
DataSink dataSink = createDataSink(session, fileSystem, path);
Optional<Supplier<OrcDataSource>> validationInputFactory = Optional.empty();
if (HiveSessionProperties.isOrcOptimizedWriterValidate(session)) {
validationInputFactory = Optional.of(() -> {
try {
return new HdfsOrcDataSource(new OrcDataSourceId(path.toString()), fileSystem.getFileStatus(path).getLen(), getOrcMaxMergeDistance(session), getOrcMaxBufferSize(session), getOrcStreamBufferSize(session), false, fileSystem.open(path), readStats);
} catch (IOException e) {
throw new PrestoException(HIVE_WRITE_VALIDATION_FAILED, e);
}
});
}
Callable<Void> rollbackAction = () -> {
fileSystem.delete(path, false);
return null;
};
Optional<DwrfWriterEncryption> dwrfWriterEncryption = createDwrfEncryption(encryptionInformation, fileColumnNames, fileColumnTypes);
return Optional.of(new OrcFileWriter(dataSink, rollbackAction, orcEncoding, fileColumnNames, fileColumnTypes, compression, orcFileWriterConfig.toOrcWriterOptionsBuilder().withFlushPolicy(DefaultOrcWriterFlushPolicy.builder().withStripeMinSize(getOrcOptimizedWriterMinStripeSize(session)).withStripeMaxSize(getOrcOptimizedWriterMaxStripeSize(session)).withStripeMaxRowCount(getOrcOptimizedWriterMaxStripeRows(session)).build()).withDictionaryMaxMemory(getOrcOptimizedWriterMaxDictionaryMemory(session)).withMaxStringStatisticsLimit(getOrcStringStatisticsLimit(session)).withIgnoreDictionaryRowGroupSizes(isExecutionBasedMemoryAccountingEnabled(session)).withDwrfStripeCacheEnabled(isDwrfWriterStripeCacheEnabled(session)).withDwrfStripeCacheMaxSize(getDwrfWriterStripeCacheeMaxSize(session)).build(), fileInputColumnIndexes, ImmutableMap.<String, String>builder().put(HiveMetadata.PRESTO_VERSION_NAME, nodeVersion.toString()).put(MetastoreUtil.PRESTO_QUERY_ID_NAME, session.getQueryId()).build(), hiveStorageTimeZone, validationInputFactory, getOrcOptimizedWriterValidateMode(session), stats, dwrfEncryptionProvider, dwrfWriterEncryption));
} catch (IOException e) {
throw new PrestoException(HIVE_WRITER_OPEN_ERROR, "Error creating " + orcEncoding + " file. " + e.getMessage(), e);
}
}
Aggregations