use of io.trino.orc.OrcDataSource in project trino by trinodb.
the class OrcFileWriter method commit.
@Override
public void commit() {
try {
if (transaction.isAcidTransactionRunning() && useAcidSchema) {
updateUserMetadata();
}
orcWriter.close();
} catch (IOException | UncheckedIOException e) {
try {
rollbackAction.call();
} catch (Exception ignored) {
// ignore
}
throw new TrinoException(HIVE_WRITER_CLOSE_ERROR, "Error committing write to Hive", e);
}
if (validationInputFactory.isPresent()) {
try {
try (OrcDataSource input = validationInputFactory.get().get()) {
long startThreadCpuTime = THREAD_MX_BEAN.getCurrentThreadCpuTime();
orcWriter.validate(input);
validationCpuNanos += THREAD_MX_BEAN.getCurrentThreadCpuTime() - startThreadCpuTime;
}
} catch (IOException | UncheckedIOException e) {
throw new TrinoException(HIVE_WRITE_VALIDATION_FAILED, e);
}
}
}
use of io.trino.orc.OrcDataSource in project trino by trinodb.
the class OrcPageSourceFactory method createOrcPageSource.
private ConnectorPageSource createOrcPageSource(HdfsEnvironment hdfsEnvironment, ConnectorIdentity identity, Configuration configuration, Path path, long start, long length, long estimatedFileSize, List<HiveColumnHandle> columns, List<HiveColumnHandle> projections, boolean useOrcColumnNames, boolean isFullAcid, TupleDomain<HiveColumnHandle> effectivePredicate, DateTimeZone legacyFileTimeZone, OrcReaderOptions options, Optional<AcidInfo> acidInfo, OptionalInt bucketNumber, boolean originalFile, AcidTransaction transaction, FileFormatDataSourceStats stats) {
for (HiveColumnHandle column : columns) {
checkArgument(column.getColumnType() == REGULAR, "column type must be regular: %s", column);
}
checkArgument(!effectivePredicate.isNone());
OrcDataSource orcDataSource;
boolean originalFilesPresent = acidInfo.isPresent() && !acidInfo.get().getOriginalFiles().isEmpty();
try {
FileSystem fileSystem = hdfsEnvironment.getFileSystem(identity, path, configuration);
FSDataInputStream inputStream = hdfsEnvironment.doAs(identity, () -> fileSystem.open(path));
orcDataSource = new HdfsOrcDataSource(new OrcDataSourceId(path.toString()), estimatedFileSize, options, inputStream, stats);
} catch (Exception e) {
if (nullToEmpty(e.getMessage()).trim().equals("Filesystem closed") || e instanceof FileNotFoundException) {
throw new TrinoException(HIVE_CANNOT_OPEN_SPLIT, e);
}
throw new TrinoException(HIVE_CANNOT_OPEN_SPLIT, splitError(e, path, start, length), e);
}
AggregatedMemoryContext memoryUsage = newSimpleAggregatedMemoryContext();
try {
Optional<OrcReader> optionalOrcReader = OrcReader.createOrcReader(orcDataSource, options);
if (optionalOrcReader.isEmpty()) {
return new EmptyPageSource();
}
OrcReader reader = optionalOrcReader.get();
if (!originalFile && acidInfo.isPresent() && !acidInfo.get().isOrcAcidVersionValidated()) {
validateOrcAcidVersion(path, reader);
}
List<OrcColumn> fileColumns = reader.getRootColumn().getNestedColumns();
int actualColumnCount = columns.size() + (isFullAcid ? 3 : 0);
List<OrcColumn> fileReadColumns = new ArrayList<>(actualColumnCount);
List<Type> fileReadTypes = new ArrayList<>(actualColumnCount);
List<OrcReader.ProjectedLayout> fileReadLayouts = new ArrayList<>(actualColumnCount);
if (isFullAcid && !originalFilesPresent) {
verifyAcidSchema(reader, path);
Map<String, OrcColumn> acidColumnsByName = uniqueIndex(fileColumns, orcColumn -> orcColumn.getColumnName().toLowerCase(ENGLISH));
fileColumns = ensureColumnNameConsistency(acidColumnsByName.get(AcidSchema.ACID_COLUMN_ROW_STRUCT.toLowerCase(ENGLISH)).getNestedColumns(), columns);
fileReadColumns.add(acidColumnsByName.get(AcidSchema.ACID_COLUMN_ORIGINAL_TRANSACTION.toLowerCase(ENGLISH)));
fileReadTypes.add(BIGINT);
fileReadLayouts.add(fullyProjectedLayout());
fileReadColumns.add(acidColumnsByName.get(AcidSchema.ACID_COLUMN_BUCKET.toLowerCase(ENGLISH)));
fileReadTypes.add(INTEGER);
fileReadLayouts.add(fullyProjectedLayout());
fileReadColumns.add(acidColumnsByName.get(AcidSchema.ACID_COLUMN_ROW_ID.toLowerCase(ENGLISH)));
fileReadTypes.add(BIGINT);
fileReadLayouts.add(fullyProjectedLayout());
}
Map<String, OrcColumn> fileColumnsByName = ImmutableMap.of();
if (useOrcColumnNames || isFullAcid) {
verifyFileHasColumnNames(fileColumns, path);
// Convert column names read from ORC files to lower case to be consistent with those stored in Hive Metastore
fileColumnsByName = uniqueIndex(fileColumns, orcColumn -> orcColumn.getColumnName().toLowerCase(ENGLISH));
}
Map<String, List<List<String>>> projectionsByColumnName = ImmutableMap.of();
Map<Integer, List<List<String>>> projectionsByColumnIndex = ImmutableMap.of();
if (useOrcColumnNames || isFullAcid) {
projectionsByColumnName = projections.stream().collect(Collectors.groupingBy(HiveColumnHandle::getBaseColumnName, mapping(OrcPageSourceFactory::getDereferencesAsList, toList())));
} else {
projectionsByColumnIndex = projections.stream().collect(Collectors.groupingBy(HiveColumnHandle::getBaseHiveColumnIndex, mapping(OrcPageSourceFactory::getDereferencesAsList, toList())));
}
TupleDomainOrcPredicateBuilder predicateBuilder = TupleDomainOrcPredicate.builder().setBloomFiltersEnabled(options.isBloomFiltersEnabled()).setDomainCompactionThreshold(domainCompactionThreshold);
Map<HiveColumnHandle, Domain> effectivePredicateDomains = effectivePredicate.getDomains().orElseThrow(() -> new IllegalArgumentException("Effective predicate is none"));
List<ColumnAdaptation> columnAdaptations = new ArrayList<>(columns.size());
for (HiveColumnHandle column : columns) {
OrcColumn orcColumn = null;
OrcReader.ProjectedLayout projectedLayout = null;
Map<Optional<HiveColumnProjectionInfo>, Domain> columnDomains = null;
if (useOrcColumnNames || isFullAcid) {
String columnName = column.getName().toLowerCase(ENGLISH);
orcColumn = fileColumnsByName.get(columnName);
if (orcColumn != null) {
projectedLayout = createProjectedLayout(orcColumn, projectionsByColumnName.get(columnName));
columnDomains = effectivePredicateDomains.entrySet().stream().filter(columnDomain -> columnDomain.getKey().getBaseColumnName().toLowerCase(ENGLISH).equals(columnName)).collect(toImmutableMap(columnDomain -> columnDomain.getKey().getHiveColumnProjectionInfo(), Map.Entry::getValue));
}
} else if (column.getBaseHiveColumnIndex() < fileColumns.size()) {
orcColumn = fileColumns.get(column.getBaseHiveColumnIndex());
if (orcColumn != null) {
projectedLayout = createProjectedLayout(orcColumn, projectionsByColumnIndex.get(column.getBaseHiveColumnIndex()));
columnDomains = effectivePredicateDomains.entrySet().stream().filter(columnDomain -> columnDomain.getKey().getBaseHiveColumnIndex() == column.getBaseHiveColumnIndex()).collect(toImmutableMap(columnDomain -> columnDomain.getKey().getHiveColumnProjectionInfo(), Map.Entry::getValue));
}
}
Type readType = column.getType();
if (orcColumn != null) {
int sourceIndex = fileReadColumns.size();
columnAdaptations.add(ColumnAdaptation.sourceColumn(sourceIndex));
fileReadColumns.add(orcColumn);
fileReadTypes.add(readType);
fileReadLayouts.add(projectedLayout);
// Add predicates on top-level and nested columns
for (Map.Entry<Optional<HiveColumnProjectionInfo>, Domain> columnDomain : columnDomains.entrySet()) {
OrcColumn nestedColumn = getNestedColumn(orcColumn, columnDomain.getKey());
if (nestedColumn != null) {
predicateBuilder.addColumn(nestedColumn.getColumnId(), columnDomain.getValue());
}
}
} else {
columnAdaptations.add(ColumnAdaptation.nullColumn(readType));
}
}
OrcRecordReader recordReader = reader.createRecordReader(fileReadColumns, fileReadTypes, fileReadLayouts, predicateBuilder.build(), start, length, legacyFileTimeZone, memoryUsage, INITIAL_BATCH_SIZE, exception -> handleException(orcDataSource.getId(), exception), NameBasedFieldMapper::create);
Optional<OrcDeletedRows> deletedRows = acidInfo.map(info -> new OrcDeletedRows(path.getName(), new OrcDeleteDeltaPageSourceFactory(options, identity, configuration, hdfsEnvironment, stats), identity, configuration, hdfsEnvironment, info, bucketNumber, memoryUsage));
Optional<Long> originalFileRowId = acidInfo.filter(OrcPageSourceFactory::hasOriginalFiles).map(info -> OriginalFilesUtils.getPrecedingRowCount(acidInfo.get().getOriginalFiles(), path, hdfsEnvironment, identity, options, configuration, stats));
if (transaction.isDelete()) {
if (originalFile) {
int bucket = bucketNumber.orElse(0);
long startingRowId = originalFileRowId.orElse(0L);
columnAdaptations.add(ColumnAdaptation.originalFileRowIdColumn(startingRowId, bucket));
} else {
columnAdaptations.add(ColumnAdaptation.rowIdColumn());
}
} else if (transaction.isUpdate()) {
HiveUpdateProcessor updateProcessor = transaction.getUpdateProcessor().orElseThrow(() -> new IllegalArgumentException("updateProcessor not present"));
List<HiveColumnHandle> dependencyColumns = projections.stream().filter(HiveColumnHandle::isBaseColumn).collect(toImmutableList());
if (originalFile) {
int bucket = bucketNumber.orElse(0);
long startingRowId = originalFileRowId.orElse(0L);
columnAdaptations.add(updatedRowColumnsWithOriginalFiles(startingRowId, bucket, updateProcessor, dependencyColumns));
} else {
columnAdaptations.add(updatedRowColumns(updateProcessor, dependencyColumns));
}
}
return new OrcPageSource(recordReader, columnAdaptations, orcDataSource, deletedRows, originalFileRowId, memoryUsage, stats);
} catch (Exception e) {
try {
orcDataSource.close();
} catch (IOException ignored) {
}
if (e instanceof TrinoException) {
throw (TrinoException) e;
}
String message = splitError(e, path, start, length);
if (e instanceof BlockMissingException) {
throw new TrinoException(HIVE_MISSING_DATA, message, e);
}
throw new TrinoException(HIVE_CANNOT_OPEN_SPLIT, message, e);
}
}
use of io.trino.orc.OrcDataSource in project trino by trinodb.
the class OriginalFilesUtils method getRowsInFile.
/**
* Returns number of rows present in the file, based on the ORC footer.
*/
private static Long getRowsInFile(Path splitPath, HdfsEnvironment hdfsEnvironment, ConnectorIdentity identity, OrcReaderOptions options, Configuration configuration, FileFormatDataSourceStats stats, long fileSize) {
try {
FileSystem fileSystem = hdfsEnvironment.getFileSystem(identity, splitPath, configuration);
FSDataInputStream inputStream = hdfsEnvironment.doAs(identity, () -> fileSystem.open(splitPath));
try (OrcDataSource orcDataSource = new HdfsOrcDataSource(new OrcDataSourceId(splitPath.toString()), fileSize, options, inputStream, stats)) {
OrcReader reader = createOrcReader(orcDataSource, options).orElseThrow(() -> new TrinoException(HIVE_CANNOT_OPEN_SPLIT, "Could not read ORC footer from empty file: " + splitPath));
return reader.getFooter().getNumberOfRows();
}
} catch (TrinoException e) {
throw e;
} catch (Exception e) {
throw new TrinoException(HIVE_CANNOT_OPEN_SPLIT, "Could not read ORC footer from file: " + splitPath, e);
}
}
use of io.trino.orc.OrcDataSource in project trino by trinodb.
the class IcebergFileWriterFactory method createOrcWriter.
private IcebergFileWriter createOrcWriter(MetricsConfig metricsConfig, Path outputPath, Schema icebergSchema, JobConf jobConf, ConnectorSession session) {
try {
FileSystem fileSystem = hdfsEnvironment.getFileSystem(session.getIdentity(), outputPath, jobConf);
OrcDataSink orcDataSink = hdfsEnvironment.doAs(session.getIdentity(), () -> new OutputStreamOrcDataSink(fileSystem.create(outputPath)));
Callable<Void> rollbackAction = () -> {
hdfsEnvironment.doAs(session.getIdentity(), () -> fileSystem.delete(outputPath, false));
return null;
};
List<Types.NestedField> columnFields = icebergSchema.columns();
List<String> fileColumnNames = columnFields.stream().map(Types.NestedField::name).collect(toImmutableList());
List<Type> fileColumnTypes = columnFields.stream().map(Types.NestedField::type).map(type -> toTrinoType(type, typeManager)).collect(toImmutableList());
Optional<Supplier<OrcDataSource>> validationInputFactory = Optional.empty();
if (isOrcWriterValidate(session)) {
validationInputFactory = Optional.of(() -> {
try {
return new HdfsOrcDataSource(new OrcDataSourceId(outputPath.toString()), hdfsEnvironment.doAs(session.getIdentity(), () -> fileSystem.getFileStatus(outputPath).getLen()), new OrcReaderOptions(), hdfsEnvironment.doAs(session.getIdentity(), () -> fileSystem.open(outputPath)), readStats);
} catch (IOException e) {
throw new TrinoException(ICEBERG_WRITE_VALIDATION_FAILED, e);
}
});
}
return new IcebergOrcFileWriter(metricsConfig, icebergSchema, orcDataSink, rollbackAction, fileColumnNames, fileColumnTypes, toOrcType(icebergSchema), getCompressionCodec(session).getOrcCompressionKind(), orcWriterOptions.withStripeMinSize(getOrcWriterMinStripeSize(session)).withStripeMaxSize(getOrcWriterMaxStripeSize(session)).withStripeMaxRowCount(getOrcWriterMaxStripeRows(session)).withDictionaryMaxMemory(getOrcWriterMaxDictionaryMemory(session)).withMaxStringStatisticsLimit(getOrcStringStatisticsLimit(session)), IntStream.range(0, fileColumnNames.size()).toArray(), ImmutableMap.<String, String>builder().put(PRESTO_VERSION_NAME, nodeVersion.toString()).put(PRESTO_QUERY_ID_NAME, session.getQueryId()).buildOrThrow(), validationInputFactory, getOrcWriterValidateMode(session), orcWriterStats);
} catch (IOException e) {
throw new TrinoException(ICEBERG_WRITER_OPEN_ERROR, "Error creating ORC file", e);
}
}
use of io.trino.orc.OrcDataSource in project trino by trinodb.
the class RaptorStorageManager method computeShardStats.
private List<ColumnStats> computeShardStats(File file) {
try (OrcDataSource dataSource = fileOrcDataSource(orcReaderOptions, file)) {
OrcReader reader = OrcReader.createOrcReader(dataSource, orcReaderOptions).orElseThrow(() -> new TrinoException(RAPTOR_ERROR, "Data file is empty: " + file));
ImmutableList.Builder<ColumnStats> list = ImmutableList.builder();
for (ColumnInfo info : getColumnInfo(reader)) {
computeColumnStats(reader, info.getColumnId(), info.getType(), typeManager).ifPresent(list::add);
}
return list.build();
} catch (IOException e) {
throw new TrinoException(RAPTOR_ERROR, "Failed to read file: " + file, e);
}
}
Aggregations