use of com.facebook.presto.orc.OrcDataSourceId in project presto by prestodb.
the class IcebergFileWriterFactory method createOrcWriter.
private IcebergFileWriter createOrcWriter(Path outputPath, Schema icebergSchema, JobConf jobConf, ConnectorSession session) {
try {
FileSystem fileSystem = hdfsEnvironment.getFileSystem(session.getUser(), outputPath, jobConf);
DataSink orcDataSink = hdfsEnvironment.doAs(session.getUser(), () -> new OutputStreamDataSink(fileSystem.create(outputPath)));
Callable<Void> rollbackAction = () -> {
hdfsEnvironment.doAs(session.getUser(), () -> fileSystem.delete(outputPath, false));
return null;
};
List<Types.NestedField> columnFields = icebergSchema.columns();
List<String> fileColumnNames = columnFields.stream().map(Types.NestedField::name).collect(toImmutableList());
List<Type> fileColumnTypes = columnFields.stream().map(Types.NestedField::type).map(type -> toPrestoType(type, typeManager)).collect(toImmutableList());
Optional<Supplier<OrcDataSource>> validationInputFactory = Optional.empty();
if (isOrcOptimizedWriterValidate(session)) {
validationInputFactory = Optional.of(() -> {
try {
return new HdfsOrcDataSource(new OrcDataSourceId(outputPath.toString()), hdfsEnvironment.doAs(session.getUser(), () -> fileSystem.getFileStatus(outputPath).getLen()), getOrcMaxMergeDistance(session), getOrcMaxBufferSize(session), getOrcStreamBufferSize(session), false, hdfsEnvironment.doAs(session.getUser(), () -> fileSystem.open(outputPath)), readStats);
} catch (IOException e) {
throw new PrestoException(ICEBERG_WRITE_VALIDATION_FAILED, e);
}
});
}
return new IcebergOrcFileWriter(icebergSchema, orcDataSink, rollbackAction, ORC, fileColumnNames, fileColumnTypes, toOrcType(icebergSchema), getCompressionCodec(session).getOrcCompressionKind(), orcFileWriterConfig.toOrcWriterOptionsBuilder().withFlushPolicy(DefaultOrcWriterFlushPolicy.builder().withStripeMinSize(HiveSessionProperties.getOrcOptimizedWriterMinStripeSize(session)).withStripeMaxSize(HiveSessionProperties.getOrcOptimizedWriterMaxStripeSize(session)).withStripeMaxRowCount(HiveSessionProperties.getOrcOptimizedWriterMaxStripeRows(session)).build()).withDictionaryMaxMemory(HiveSessionProperties.getOrcOptimizedWriterMaxDictionaryMemory(session)).withMaxStringStatisticsLimit(HiveSessionProperties.getOrcStringStatisticsLimit(session)).build(), IntStream.range(0, fileColumnNames.size()).toArray(), ImmutableMap.<String, String>builder().put(PRESTO_VERSION_NAME, nodeVersion.toString()).put(PRESTO_QUERY_ID_NAME, session.getQueryId()).build(), UTC, validationInputFactory, getOrcOptimizedWriterValidateMode(session), orcWriterStats, dwrfEncryptionProvider, Optional.empty());
} catch (IOException e) {
throw new PrestoException(ICEBERG_WRITER_OPEN_ERROR, "Error creating ORC file", e);
}
}
use of com.facebook.presto.orc.OrcDataSourceId in project presto by prestodb.
the class OrcBatchPageSourceFactory method createOrcPageSource.
public static ConnectorPageSource createOrcPageSource(OrcEncoding orcEncoding, HdfsEnvironment hdfsEnvironment, String sessionUser, Configuration configuration, Path path, long start, long length, long fileSize, List<HiveColumnHandle> columns, boolean useOrcColumnNames, TupleDomain<HiveColumnHandle> effectivePredicate, DateTimeZone hiveStorageTimeZone, TypeManager typeManager, StandardFunctionResolution functionResolution, DataSize maxBufferSize, DataSize streamBufferSize, boolean lazyReadSmallRanges, boolean orcBloomFiltersEnabled, FileFormatDataSourceStats stats, int domainCompactionThreshold, OrcFileTailSource orcFileTailSource, StripeMetadataSourceFactory stripeMetadataSourceFactory, HiveFileContext hiveFileContext, OrcReaderOptions orcReaderOptions, Optional<EncryptionInformation> encryptionInformation, DwrfEncryptionProvider dwrfEncryptionProvider) {
checkArgument(domainCompactionThreshold >= 1, "domainCompactionThreshold must be at least 1");
OrcDataSource orcDataSource;
try {
FSDataInputStream inputStream = hdfsEnvironment.getFileSystem(sessionUser, path, configuration).openFile(path, hiveFileContext);
orcDataSource = new HdfsOrcDataSource(new OrcDataSourceId(path.toString()), fileSize, orcReaderOptions.getMaxMergeDistance(), maxBufferSize, streamBufferSize, lazyReadSmallRanges, inputStream, stats);
} catch (Exception e) {
if (nullToEmpty(e.getMessage()).trim().equals("Filesystem closed") || e instanceof FileNotFoundException) {
throw new PrestoException(HIVE_CANNOT_OPEN_SPLIT, e);
}
throw new PrestoException(HIVE_CANNOT_OPEN_SPLIT, splitError(e, path, start, length), e);
}
OrcAggregatedMemoryContext systemMemoryUsage = new HiveOrcAggregatedMemoryContext();
try {
DwrfKeyProvider dwrfKeyProvider = new ProjectionBasedDwrfKeyProvider(encryptionInformation, columns, useOrcColumnNames, path);
OrcReader reader = new OrcReader(orcDataSource, orcEncoding, orcFileTailSource, stripeMetadataSourceFactory, new HiveOrcAggregatedMemoryContext(), orcReaderOptions, hiveFileContext.isCacheable(), dwrfEncryptionProvider, dwrfKeyProvider, hiveFileContext.getStats());
List<HiveColumnHandle> physicalColumns = getPhysicalHiveColumnHandles(columns, useOrcColumnNames, reader.getTypes(), path);
ImmutableMap.Builder<Integer, Type> includedColumns = ImmutableMap.builder();
ImmutableList.Builder<ColumnReference<HiveColumnHandle>> columnReferences = ImmutableList.builder();
for (HiveColumnHandle column : physicalColumns) {
if (column.getColumnType() == REGULAR) {
Type type = typeManager.getType(column.getTypeSignature());
includedColumns.put(column.getHiveColumnIndex(), type);
columnReferences.add(new ColumnReference<>(column, column.getHiveColumnIndex(), type));
}
}
if (!physicalColumns.isEmpty() && physicalColumns.stream().allMatch(hiveColumnHandle -> hiveColumnHandle.getColumnType() == AGGREGATED)) {
return new AggregatedOrcPageSource(physicalColumns, reader.getFooter(), typeManager, functionResolution);
}
OrcPredicate predicate = new TupleDomainOrcPredicate<>(effectivePredicate, columnReferences.build(), orcBloomFiltersEnabled, Optional.of(domainCompactionThreshold));
OrcBatchRecordReader recordReader = reader.createBatchRecordReader(includedColumns.build(), predicate, start, length, hiveStorageTimeZone, systemMemoryUsage, INITIAL_BATCH_SIZE);
return new OrcBatchPageSource(recordReader, reader.getOrcDataSource(), physicalColumns, typeManager, systemMemoryUsage, stats, hiveFileContext.getStats());
} catch (Exception e) {
try {
orcDataSource.close();
} catch (IOException ignored) {
}
if (e instanceof PrestoException) {
throw (PrestoException) e;
}
String message = splitError(e, path, start, length);
if (e.getClass().getSimpleName().equals("BlockMissingException")) {
throw new PrestoException(HIVE_MISSING_DATA, message, e);
}
throw new PrestoException(HIVE_CANNOT_OPEN_SPLIT, message, e);
}
}
use of com.facebook.presto.orc.OrcDataSourceId in project presto by prestodb.
the class IcebergPageSourceProvider method createBatchOrcPageSource.
private static ConnectorPageSource createBatchOrcPageSource(HdfsEnvironment hdfsEnvironment, String user, Configuration configuration, Path path, long start, long length, boolean isCacheable, List<IcebergColumnHandle> regularColumns, TypeManager typeManager, TupleDomain<IcebergColumnHandle> effectivePredicate, OrcReaderOptions options, OrcEncoding orcEncoding, DataSize maxBufferSize, DataSize streamBufferSize, boolean lazyReadSmallRanges, boolean orcBloomFiltersEnabled, int domainCompactionThreshold, OrcFileTailSource orcFileTailSource, StripeMetadataSourceFactory stripeMetadataSourceFactory, FileFormatDataSourceStats stats, Optional<EncryptionInformation> encryptionInformation, DwrfEncryptionProvider dwrfEncryptionProvider) {
OrcDataSource orcDataSource = null;
try {
ExtendedFileSystem fileSystem = hdfsEnvironment.getFileSystem(user, path, configuration);
FileStatus fileStatus = fileSystem.getFileStatus(path);
long fileSize = fileStatus.getLen();
long modificationTime = fileStatus.getModificationTime();
HiveFileContext hiveFileContext = new HiveFileContext(true, NO_CACHE_CONSTRAINTS, Optional.empty(), Optional.of(fileSize), modificationTime, false);
FSDataInputStream inputStream = hdfsEnvironment.doAs(user, () -> fileSystem.openFile(path, hiveFileContext));
orcDataSource = new HdfsOrcDataSource(new OrcDataSourceId(path.toString()), fileSize, options.getMaxMergeDistance(), maxBufferSize, streamBufferSize, lazyReadSmallRanges, inputStream, stats);
// Todo: pass real columns to ProjectionBasedDwrfKeyProvider instead of ImmutableList.of()
DwrfKeyProvider dwrfKeyProvider = new ProjectionBasedDwrfKeyProvider(encryptionInformation, ImmutableList.of(), true, path);
RuntimeStats runtimeStats = new RuntimeStats();
OrcReader reader = new OrcReader(orcDataSource, orcEncoding, orcFileTailSource, stripeMetadataSourceFactory, new HiveOrcAggregatedMemoryContext(), options, isCacheable, dwrfEncryptionProvider, dwrfKeyProvider, runtimeStats);
List<HiveColumnHandle> physicalColumnHandles = new ArrayList<>(regularColumns.size());
ImmutableMap.Builder<Integer, Type> includedColumns = ImmutableMap.builder();
ImmutableList.Builder<TupleDomainOrcPredicate.ColumnReference<HiveColumnHandle>> columnReferences = ImmutableList.builder();
List<IcebergOrcColumn> fileOrcColumns = getFileOrcColumns(reader);
Map<Integer, IcebergOrcColumn> fileOrcColumnByIcebergId = fileOrcColumns.stream().filter(orcColumn -> orcColumn.getAttributes().containsKey(ORC_ICEBERG_ID_KEY)).collect(toImmutableMap(orcColumn -> Integer.parseInt(orcColumn.getAttributes().get(ORC_ICEBERG_ID_KEY)), orcColumn -> IcebergOrcColumn.copy(orcColumn).setIcebergColumnId(Optional.of(Integer.parseInt(orcColumn.getAttributes().get(ORC_ICEBERG_ID_KEY))))));
Map<String, IcebergOrcColumn> fileOrcColumnsByName = uniqueIndex(fileOrcColumns, orcColumn -> orcColumn.getColumnName().toLowerCase(ENGLISH));
int nextMissingColumnIndex = fileOrcColumnsByName.size();
for (IcebergColumnHandle column : regularColumns) {
IcebergOrcColumn icebergOrcColumn;
boolean isExcludeColumn = false;
if (fileOrcColumnByIcebergId.isEmpty()) {
icebergOrcColumn = fileOrcColumnsByName.get(column.getName());
} else {
icebergOrcColumn = fileOrcColumnByIcebergId.get(column.getId());
if (icebergOrcColumn == null) {
// Cannot get orc column from 'fileOrcColumnByIcebergId', which means SchemaEvolution may have happened, so we get orc column by column name.
icebergOrcColumn = fileOrcColumnsByName.get(column.getName());
if (icebergOrcColumn != null) {
isExcludeColumn = true;
}
}
}
if (icebergOrcColumn != null) {
HiveColumnHandle columnHandle = new HiveColumnHandle(// Todo: using orc file column name
column.getName(), toHiveType(column.getType()), column.getType().getTypeSignature(), icebergOrcColumn.getOrcColumnId(), icebergOrcColumn.getColumnType(), Optional.empty(), Optional.empty());
physicalColumnHandles.add(columnHandle);
// Skip SchemaEvolution column
if (!isExcludeColumn) {
includedColumns.put(columnHandle.getHiveColumnIndex(), typeManager.getType(columnHandle.getTypeSignature()));
columnReferences.add(new TupleDomainOrcPredicate.ColumnReference<>(columnHandle, columnHandle.getHiveColumnIndex(), typeManager.getType(columnHandle.getTypeSignature())));
}
} else {
physicalColumnHandles.add(new HiveColumnHandle(column.getName(), toHiveType(column.getType()), column.getType().getTypeSignature(), nextMissingColumnIndex++, REGULAR, Optional.empty(), Optional.empty()));
}
}
TupleDomain<HiveColumnHandle> hiveColumnHandleTupleDomain = effectivePredicate.transform(column -> {
IcebergOrcColumn icebergOrcColumn;
if (fileOrcColumnByIcebergId.isEmpty()) {
icebergOrcColumn = fileOrcColumnsByName.get(column.getName());
} else {
icebergOrcColumn = fileOrcColumnByIcebergId.get(column.getId());
if (icebergOrcColumn == null) {
// Cannot get orc column from 'fileOrcColumnByIcebergId', which means SchemaEvolution may have happened, so we get orc column by column name.
icebergOrcColumn = fileOrcColumnsByName.get(column.getName());
}
}
return new HiveColumnHandle(column.getName(), toHiveType(column.getType()), column.getType().getTypeSignature(), // Note: the HiveColumnHandle.hiveColumnIndex starts from '0' while the IcebergColumnHandle.id starts from '1'
icebergOrcColumn != null ? icebergOrcColumn.getOrcColumnId() : column.getId() - 1, icebergOrcColumn != null ? icebergOrcColumn.getColumnType() : REGULAR, Optional.empty(), Optional.empty());
});
OrcPredicate predicate = new TupleDomainOrcPredicate<>(hiveColumnHandleTupleDomain, columnReferences.build(), orcBloomFiltersEnabled, Optional.of(domainCompactionThreshold));
OrcAggregatedMemoryContext systemMemoryUsage = new HiveOrcAggregatedMemoryContext();
OrcBatchRecordReader recordReader = reader.createBatchRecordReader(includedColumns.build(), predicate, start, length, UTC, systemMemoryUsage, INITIAL_BATCH_SIZE);
return new OrcBatchPageSource(recordReader, orcDataSource, physicalColumnHandles, typeManager, systemMemoryUsage, stats, runtimeStats);
} catch (Exception e) {
if (orcDataSource != null) {
try {
orcDataSource.close();
} catch (IOException ignored) {
}
}
if (e instanceof PrestoException) {
throw (PrestoException) e;
}
String message = format("Error opening Iceberg split %s (offset=%s, length=%s): %s", path, start, length, e.getMessage());
if (e instanceof BlockMissingException) {
throw new PrestoException(ICEBERG_MISSING_DATA, message, e);
}
throw new PrestoException(ICEBERG_CANNOT_OPEN_SPLIT, message, e);
}
}
use of com.facebook.presto.orc.OrcDataSourceId in project presto by prestodb.
the class OrcFileWriterFactory method createFileWriter.
@Override
public Optional<HiveFileWriter> createFileWriter(Path path, List<String> inputColumnNames, StorageFormat storageFormat, Properties schema, JobConf configuration, ConnectorSession session, Optional<EncryptionInformation> encryptionInformation) {
if (!HiveSessionProperties.isOrcOptimizedWriterEnabled(session)) {
return Optional.empty();
}
OrcEncoding orcEncoding;
if (OrcOutputFormat.class.getName().equals(storageFormat.getOutputFormat())) {
orcEncoding = ORC;
} else if (com.facebook.hive.orc.OrcOutputFormat.class.getName().equals(storageFormat.getOutputFormat())) {
orcEncoding = DWRF;
} else {
return Optional.empty();
}
CompressionKind compression = getCompression(schema, configuration, orcEncoding);
// existing tables and partitions may have columns in a different order than the writer is providing, so build
// an index to rearrange columns in the proper order
List<String> fileColumnNames = Splitter.on(',').trimResults().omitEmptyStrings().splitToList(schema.getProperty(META_TABLE_COLUMNS, ""));
List<Type> fileColumnTypes = toHiveTypes(schema.getProperty(META_TABLE_COLUMN_TYPES, "")).stream().map(hiveType -> hiveType.getType(typeManager)).collect(toList());
int[] fileInputColumnIndexes = fileColumnNames.stream().mapToInt(inputColumnNames::indexOf).toArray();
try {
FileSystem fileSystem = hdfsEnvironment.getFileSystem(session.getUser(), path, configuration);
DataSink dataSink = createDataSink(session, fileSystem, path);
Optional<Supplier<OrcDataSource>> validationInputFactory = Optional.empty();
if (HiveSessionProperties.isOrcOptimizedWriterValidate(session)) {
validationInputFactory = Optional.of(() -> {
try {
return new HdfsOrcDataSource(new OrcDataSourceId(path.toString()), fileSystem.getFileStatus(path).getLen(), getOrcMaxMergeDistance(session), getOrcMaxBufferSize(session), getOrcStreamBufferSize(session), false, fileSystem.open(path), readStats);
} catch (IOException e) {
throw new PrestoException(HIVE_WRITE_VALIDATION_FAILED, e);
}
});
}
Callable<Void> rollbackAction = () -> {
fileSystem.delete(path, false);
return null;
};
Optional<DwrfWriterEncryption> dwrfWriterEncryption = createDwrfEncryption(encryptionInformation, fileColumnNames, fileColumnTypes);
return Optional.of(new OrcFileWriter(dataSink, rollbackAction, orcEncoding, fileColumnNames, fileColumnTypes, compression, orcFileWriterConfig.toOrcWriterOptionsBuilder().withFlushPolicy(DefaultOrcWriterFlushPolicy.builder().withStripeMinSize(getOrcOptimizedWriterMinStripeSize(session)).withStripeMaxSize(getOrcOptimizedWriterMaxStripeSize(session)).withStripeMaxRowCount(getOrcOptimizedWriterMaxStripeRows(session)).build()).withDictionaryMaxMemory(getOrcOptimizedWriterMaxDictionaryMemory(session)).withMaxStringStatisticsLimit(getOrcStringStatisticsLimit(session)).withIgnoreDictionaryRowGroupSizes(isExecutionBasedMemoryAccountingEnabled(session)).withDwrfStripeCacheEnabled(isDwrfWriterStripeCacheEnabled(session)).withDwrfStripeCacheMaxSize(getDwrfWriterStripeCacheeMaxSize(session)).build(), fileInputColumnIndexes, ImmutableMap.<String, String>builder().put(HiveMetadata.PRESTO_VERSION_NAME, nodeVersion.toString()).put(MetastoreUtil.PRESTO_QUERY_ID_NAME, session.getQueryId()).build(), hiveStorageTimeZone, validationInputFactory, getOrcOptimizedWriterValidateMode(session), stats, dwrfEncryptionProvider, dwrfWriterEncryption));
} catch (IOException e) {
throw new PrestoException(HIVE_WRITER_OPEN_ERROR, "Error creating " + orcEncoding + " file. " + e.getMessage(), e);
}
}
use of com.facebook.presto.orc.OrcDataSourceId in project presto by prestodb.
the class TestLongDecode method assertVIntRoundTrip.
private static void assertVIntRoundTrip(SliceOutput output, long value, boolean signed) throws IOException {
// write using Hive's code
output.reset();
if (signed) {
writeVslong(output, value);
} else {
writeVulong(output, value);
}
Slice hiveBytes = Slices.copyOf(output.slice());
// write using Presto's code, and verify they are the same
output.reset();
writeVLong(output, value, signed);
Slice prestoBytes = Slices.copyOf(output.slice());
if (!prestoBytes.equals(hiveBytes)) {
assertEquals(prestoBytes, hiveBytes);
}
// read using Hive's code
if (signed) {
long readValueOld = readVslong(hiveBytes.getInput());
assertEquals(readValueOld, value);
} else {
long readValueOld = readVulong(hiveBytes.getInput());
assertEquals(readValueOld, value);
}
// read using Presto's code
TestingHiveOrcAggregatedMemoryContext aggregatedMemoryContext = new TestingHiveOrcAggregatedMemoryContext();
long readValueNew = readVInt(signed, new OrcInputStream(new OrcDataSourceId("test"), new SharedBuffer(aggregatedMemoryContext.newOrcLocalMemoryContext("sharedDecompressionBuffer")), hiveBytes.getInput(), Optional.empty(), Optional.empty(), aggregatedMemoryContext, hiveBytes.getRetainedSize()));
assertEquals(readValueNew, value);
}
Aggregations