use of io.prestosql.orc.OrcDataSource in project hetu-core by openlookeng.
the class OrcFileWriterFactory method createFileWriter.
/**
 * Creates an ORC-backed {@link HiveFileWriter} for {@code path}.
 *
 * <p>Returns {@link Optional#empty()} when the storage format's output format is not
 * {@link OrcOutputFormat}. When the table schema is transactional and not insert-only, the file
 * is written with the ACID layout: the five ACID metadata columns followed by a row struct that
 * holds the table columns. For {@link HiveACIDWriteType#UPDATE} a second writer for the
 * delete-delta file is created through a recursive call and passed to the returned writer.
 *
 * @param path file to write
 * @param inputColumnNames column names in the order the writer's input supplies them
 * @param storageFormat storage format whose output format is checked against ORC
 * @param schema table/partition properties used for compression, column names and types, and ACID flags
 * @param configuration job configuration used to resolve compression and the file system
 * @param session session supplying the user, query id, and ORC writer properties
 * @param acidOptions ACID options; read (via {@code get()}) only for UPDATE writes on full-ACID tables
 * @param acidWriteType ACID write type, if any
 * @return the writer, or empty if the output format is not ORC
 * @throws PrestoException with {@code HIVE_WRITER_OPEN_ERROR} if opening the file fails with an {@link IOException}
 */
@Override
public Optional<HiveFileWriter> createFileWriter(Path path, List<String> inputColumnNames, StorageFormat storageFormat, Properties schema, JobConf configuration, ConnectorSession session, Optional<AcidOutputFormat.Options> acidOptions, Optional<HiveACIDWriteType> acidWriteType)
{
    String requestedOutputFormat = storageFormat.getOutputFormat();
    if (!OrcOutputFormat.class.getName().equals(requestedOutputFormat)) {
        return Optional.empty();
    }

    CompressionKind compression = getCompression(schema, configuration);

    // Existing tables and partitions may order their columns differently from the writer's input,
    // so record, for every table column, its position in the input.
    List<String> tableColumnNames = getColumnNames(schema);
    List<Type> tableColumnTypes = getColumnTypes(schema).stream()
            .map(hiveType -> hiveType.getType(typeManager))
            .collect(toList());
    int[] fileInputColumnIndexes = tableColumnNames.stream()
            .mapToInt(inputColumnNames::indexOf)
            .toArray();

    // Unless the table is full ACID, the file layout is simply the table layout.
    List<String> writerColumnNames = tableColumnNames;
    List<Type> writerColumnTypes = tableColumnTypes;
    Optional<HiveFileWriter> deleteDeltaWriter = Optional.empty();

    boolean fullAcidTable = AcidUtils.isTablePropertyTransactional(schema) && !AcidUtils.isInsertOnlyTable(schema);
    if (fullAcidTable) {
        // The table columns become the fields of the trailing "row" struct column.
        ImmutableList.Builder<RowType.Field> rowFields = ImmutableList.builder();
        for (int column = 0; column < tableColumnNames.size(); column++) {
            rowFields.add(new RowType.Field(Optional.of(tableColumnNames.get(column)), tableColumnTypes.get(column)));
        }

        writerColumnNames = ImmutableList.of(
                OrcPageSourceFactory.ACID_COLUMN_OPERATION,
                OrcPageSourceFactory.ACID_COLUMN_ORIGINAL_TRANSACTION,
                OrcPageSourceFactory.ACID_COLUMN_BUCKET,
                OrcPageSourceFactory.ACID_COLUMN_ROW_ID,
                OrcPageSourceFactory.ACID_COLUMN_CURRENT_TRANSACTION,
                OrcPageSourceFactory.ACID_COLUMN_ROW_STRUCT);
        writerColumnTypes = ImmutableList.of(INTEGER, BIGINT, INTEGER, BIGINT, BIGINT, RowType.from(rowFields.build()));

        boolean updateWrite = acidWriteType.isPresent() && acidWriteType.get() == HiveACIDWriteType.UPDATE;
        if (updateWrite) {
            // UPDATE gets a companion writer that targets the delete-delta file.
            AcidOutputFormat.Options deleteOptions = acidOptions.get().clone().writingDeleteDelta(true);
            Path grandParent = path.getParent().getParent();
            Path deleteDeltaPath = AcidUtils.createFilename(grandParent, deleteOptions);
            deleteDeltaWriter = createFileWriter(deleteDeltaPath, inputColumnNames, storageFormat, schema, configuration, session, Optional.of(deleteOptions), Optional.of(HiveACIDWriteType.DELETE));
        }
    }

    try {
        FileSystem fileSystem = hdfsEnvironment.getFileSystem(session.getUser(), path, configuration);
        OrcDataSink orcDataSink = createOrcDataSink(session, fileSystem, path);

        // When write validation is enabled, the writer gets a factory that re-opens the file for reading.
        Optional<Supplier<OrcDataSource>> validationInputFactory;
        if (HiveSessionProperties.isOrcOptimizedWriterValidate(session)) {
            Supplier<OrcDataSource> reopenWrittenFile = () -> {
                try {
                    FileStatus fileStatus = fileSystem.getFileStatus(path);
                    return new HdfsOrcDataSource(
                            new OrcDataSourceId(path.toString()),
                            fileStatus.getLen(),
                            HiveSessionProperties.getOrcMaxMergeDistance(session),
                            HiveSessionProperties.getOrcMaxBufferSize(session),
                            HiveSessionProperties.getOrcStreamBufferSize(session),
                            false,
                            fileSystem.open(path),
                            readStats,
                            fileStatus.getModificationTime());
                }
                catch (IOException e) {
                    throw new PrestoException(HiveErrorCode.HIVE_WRITE_VALIDATION_FAILED, e);
                }
            };
            validationInputFactory = Optional.of(reopenWrittenFile);
        }
        else {
            validationInputFactory = Optional.empty();
        }

        // Rolling back means removing the (possibly partial) file.
        Callable<Void> rollbackAction = () -> {
            fileSystem.delete(path, false);
            return null;
        };

        ImmutableMap<String, String> fileMetadata = ImmutableMap.<String, String>builder()
                .put(HiveMetadata.PRESTO_VERSION_NAME, nodeVersion.toString())
                .put(HiveMetadata.PRESTO_QUERY_ID_NAME, session.getQueryId())
                .put("hive.acid.version", String.valueOf(AcidUtils.OrcAcidVersion.ORC_ACID_VERSION))
                .build();

        return Optional.of(new OrcFileWriter(
                orcDataSink,
                rollbackAction,
                writerColumnNames,
                writerColumnTypes,
                tableColumnTypes,
                compression,
                orcWriterOptions
                        .withStripeMinSize(HiveSessionProperties.getOrcOptimizedWriterMinStripeSize(session))
                        .withStripeMaxSize(HiveSessionProperties.getOrcOptimizedWriterMaxStripeSize(session))
                        .withStripeMaxRowCount(HiveSessionProperties.getOrcOptimizedWriterMaxStripeRows(session))
                        .withDictionaryMaxMemory(HiveSessionProperties.getOrcOptimizedWriterMaxDictionaryMemory(session))
                        .withMaxStringStatisticsLimit(HiveSessionProperties.getOrcStringStatisticsLimit(session)),
                writeLegacyVersion,
                fileInputColumnIndexes,
                fileMetadata,
                validationInputFactory,
                HiveSessionProperties.getOrcOptimizedWriterValidateMode(session),
                stats,
                acidOptions,
                acidWriteType,
                deleteDeltaWriter,
                path));
    }
    catch (IOException e) {
        throw new PrestoException(HiveErrorCode.HIVE_WRITER_OPEN_ERROR, "Error creating ORC file", e);
    }
}
use of io.prestosql.orc.OrcDataSource in project hetu-core by openlookeng.
the class OrcFileWriter method commit.
/**
 * Commits the write: commits the delete-delta writer when one is present, closes the ORC writer,
 * and, when a validation input factory was supplied, re-opens the written file and validates it.
 *
 * <p>If committing or closing fails with an I/O error, the rollback action is invoked before the
 * failure is reported. A failure of the rollback itself does not replace the original error; it
 * is attached to it as a suppressed exception and a warning is logged.
 *
 * @throws PrestoException with {@code HIVE_WRITER_CLOSE_ERROR} if committing or closing fails
 *         with an {@link IOException} or {@link UncheckedIOException}, or with
 *         {@code HIVE_WRITE_VALIDATION_FAILED} if validation fails with one of those exceptions
 */
@Override
public void commit()
{
    try {
        if (deleteDeltaFileWriter.isPresent()) {
            deleteDeltaFileWriter.get().commit();
        }
        orcWriter.close();
    }
    catch (IOException | UncheckedIOException e) {
        try {
            rollbackAction.call();
        }
        catch (Exception rollbackFailure) {
            // Rollback is best effort, but its failure must stay diagnosable: keep it on the
            // commit error instead of dropping it. The identity check avoids the
            // IllegalArgumentException that addSuppressed throws on self-suppression.
            if (rollbackFailure != e) {
                e.addSuppressed(rollbackFailure);
            }
            log.warn("RollbackAction error after ORC commit error");
        }
        throw new PrestoException(HIVE_WRITER_CLOSE_ERROR, "Error committing write to Hive", e);
    }

    if (validationInputFactory.isPresent()) {
        // The catch below also covers opening and closing the validation input.
        try (OrcDataSource input = validationInputFactory.get().get()) {
            long startThreadCpuTime = THREAD_MX_BEAN.getCurrentThreadCpuTime();
            orcWriter.validate(input);
            // Only the CPU time of a successful validation is accumulated.
            validationCpuNanos += THREAD_MX_BEAN.getCurrentThreadCpuTime() - startThreadCpuTime;
        }
        catch (IOException | UncheckedIOException e) {
            throw new PrestoException(HIVE_WRITE_VALIDATION_FAILED, e);
        }
    }
}
use of io.prestosql.orc.OrcDataSource in project hetu-core by openlookeng.
the class SortingFileWriter method mergeFiles.
/**
 * Opens every given temporary file, merges their pages through a {@code MergingPageIterator}
 * (using this writer's types, sort fields, and sort orders), passes each merged page to
 * {@code consumer}, and finally deletes the temporary files.
 *
 * <p>All readers are closed when the method exits, whether it succeeds or fails. The temporary
 * files are deleted only after the merge has completed.
 *
 * @param files temporary files to merge
 * @param consumer receives every merged page
 * @throws UncheckedIOException if opening, reading, or deleting a file fails with an
 *         {@link IOException}, including when a file still exists after the delete call
 */
private void mergeFiles(Iterable<TempFile> files, Consumer<Page> consumer)
{
    try (Closer closer = Closer.create()) {
        Collection<Iterator<Page>> iterators = new ArrayList<>();

        for (TempFile tempFile : files) {
            Path file = tempFile.getPath();
            FileStatus fileStatus = fileSystem.getFileStatus(file);
            OrcDataSource dataSource = new HdfsOrcDataSource(
                    new OrcDataSourceId(file.toString()),
                    fileStatus.getLen(),
                    new DataSize(1, MEGABYTE),
                    new DataSize(8, MEGABYTE),
                    new DataSize(8, MEGABYTE),
                    false,
                    fileSystem.open(file),
                    new FileFormatDataSourceStats(),
                    fileStatus.getModificationTime());

            TempFileReader reader;
            try {
                reader = new TempFileReader(types, dataSource);
            }
            catch (Throwable t) {
                // The data source (and the stream opened for it) is only closed through the
                // reader. If the reader could not be constructed, nothing else would close it,
                // so release it here and rethrow the original failure unchanged.
                try {
                    dataSource.close();
                }
                catch (IOException | RuntimeException closeFailure) {
                    if (closeFailure != t) {
                        t.addSuppressed(closeFailure);
                    }
                }
                throw t;
            }

            // Closing the reader also closes the data source
            closer.register(reader);
            iterators.add(reader);
        }

        new MergingPageIterator(iterators, types, sortFields, sortOrders).forEachRemaining(consumer);

        for (TempFile tempFile : files) {
            Path file = tempFile.getPath();
            fileSystem.delete(file, false);
            // The result of delete() is not inspected; existence is checked explicitly instead.
            if (fileSystem.exists(file)) {
                throw new IOException("Failed to delete temporary file: " + file);
            }
        }
    }
    catch (IOException e) {
        throw new UncheckedIOException(e);
    }
}
use of io.prestosql.orc.OrcDataSource in project hetu-core by openlookeng.
the class OrcPageSourceFactory method createOrcPageSource.
/**
 * Builds an {@link OrcPageSource} that reads the byte range {@code [start, start + length)} of
 * the ORC file at {@code path}.
 *
 * <p>Steps, in order:
 * <ol>
 * <li>validate that every requested column is a regular column or the row-id column, and that
 *     the predicate is not "none";</li>
 * <li>wrap a lazily opened HDFS stream in an {@link HdfsOrcDataSource};</li>
 * <li>obtain the file tail (through the file-tail cache when it is enabled) and create the
 *     {@link OrcReader};</li>
 * <li>for full-ACID reads of files whose name contains "bucket", verify the ACID schema and
 *     put the five ACID metadata columns first in the list of columns to read;</li>
 * <li>map every requested column to a file column (by name or by index), to a struct adaptation
 *     for the update-row-id column, or to a null column when the file does not contain it;</li>
 * <li>create the record reader and the deleted-rows helper and assemble the page source.</li>
 * </ol>
 *
 * <p>If anything fails after the data source was created, the data source is closed before the
 * failure is reported; a failure of that close is attached to the original exception as suppressed.
 *
 * <p>{@code dynamicFilters} and {@code domainCompactionThreshold} are accepted but not read by
 * this method. Parameters not listed below are passed through unchanged to the data source, the
 * reader, the record reader, or the deleted-rows helper.
 *
 * @param path ORC file to read
 * @param start start offset of the split, passed to the record reader
 * @param length length of the split, passed to the record reader
 * @param fileSize size passed to the data source
 * @param columns requested columns; each yields exactly one column adaptation, in this order
 * @param useOrcColumnNames whether to match columns by (lower-cased) ORC column name instead of by index
 * @param isFullAcid whether the read is a full-ACID read; also forces matching by name
 * @param effectivePredicate predicate whose domains are pushed into the ORC predicate; must not be "none"
 * @param tinyStripeThreshold threshold used when wrapping the data source with a cache for tiny files
 * @param indexes optional index metadata; also decides whether the page source loads eagerly
 * @param dataSourceLastModifiedTime modification time passed to the data source
 * @return the page source for the split
 * @throws PrestoException with {@code HIVE_CANNOT_OPEN_SPLIT} or {@code HIVE_MISSING_DATA} when the
 *         split cannot be opened; a {@link PrestoException} raised while building the reader is
 *         rethrown as is
 */
public static OrcPageSource createOrcPageSource(HdfsEnvironment hdfsEnvironment, String sessionUser, Configuration configuration, Path path, long start, long length, long fileSize, List<HiveColumnHandle> columns, boolean useOrcColumnNames, boolean isFullAcid, TupleDomain<HiveColumnHandle> effectivePredicate, DateTimeZone legacyFileTimeZone, TypeManager typeManager, DataSize maxMergeDistance, DataSize maxBufferSize, DataSize streamBufferSize, DataSize tinyStripeThreshold, DataSize maxReadBlockSize, boolean lazyReadSmallRanges, boolean orcBloomFiltersEnabled, FileFormatDataSourceStats stats, Optional<DynamicFilterSupplier> dynamicFilters, Optional<DeleteDeltaLocations> deleteDeltaLocations, Optional<Long> startRowOffsetOfFile, Optional<List<IndexMetadata>> indexes, SplitMetadata splitMetadata, OrcCacheStore orcCacheStore, OrcCacheProperties orcCacheProperties, int domainCompactionThreshold, boolean pageMetadataEnabled, long dataSourceLastModifiedTime)
{
    for (HiveColumnHandle column : columns) {
        checkArgument(column.getColumnType() == HiveColumnHandle.ColumnType.REGULAR || column.getHiveColumnIndex() == HiveColumnHandle.ROW_ID__COLUMN_INDEX, "column type must be regular: %s", column);
    }
    checkArgument(!effectivePredicate.isNone());

    OrcDataSource orcDataSource;
    try {
        // Always create a lazy Stream. HDFS stream opened only when required.
        FSDataInputStream inputStream = new FSDataInputStream(new LazyFSInputStream(() -> {
            FileSystem fileSystem = hdfsEnvironment.getFileSystem(sessionUser, path, configuration);
            return hdfsEnvironment.doAs(sessionUser, () -> fileSystem.open(path));
        }));
        orcDataSource = new HdfsOrcDataSource(new OrcDataSourceId(path.toString()), fileSize, maxMergeDistance, maxBufferSize, streamBufferSize, lazyReadSmallRanges, inputStream, stats, dataSourceLastModifiedTime);
    }
    catch (Exception e) {
        if (nullToEmpty(e.getMessage()).trim().equals("Filesystem closed") || e instanceof FileNotFoundException) {
            throw new PrestoException(HIVE_CANNOT_OPEN_SPLIT, e);
        }
        throw new PrestoException(HIVE_CANNOT_OPEN_SPLIT, splitError(e, path, start, length), e);
    }

    AggregatedMemoryContext systemMemoryUsage = newSimpleAggregatedMemoryContext();
    try {
        OrcDataSource readerLocalDataSource = OrcReader.wrapWithCacheIfTiny(orcDataSource, tinyStripeThreshold);

        // The file tail is always loaded from the unwrapped data source; the cache key combines
        // the data source id with its last-modified time.
        OrcFileTail fileTail;
        if (orcCacheProperties.isFileTailCacheEnabled()) {
            try {
                OrcDataSourceIdWithTimeStamp orcDataSourceIdWithTimeStamp = new OrcDataSourceIdWithTimeStamp(readerLocalDataSource.getId(), readerLocalDataSource.getLastModifiedTime());
                fileTail = orcCacheStore.getFileTailCache().get(new OrcFileTailCacheKey(orcDataSourceIdWithTimeStamp), () -> OrcPageSourceFactory.createFileTail(orcDataSource));
            }
            catch (UncheckedExecutionException | ExecutionException executionException) {
                handleCacheLoadException(executionException);
                log.debug(executionException.getCause(), "Error while caching the Orc file tail. Falling back to default flow");
                fileTail = OrcPageSourceFactory.createFileTail(orcDataSource);
            }
        }
        else {
            fileTail = OrcPageSourceFactory.createFileTail(orcDataSource);
        }
        OrcReader reader = new OrcReader(readerLocalDataSource, fileTail, maxMergeDistance, tinyStripeThreshold, maxReadBlockSize);

        List<OrcColumn> fileColumns = reader.getRootColumn().getNestedColumns();
        // Full-ACID reads may prepend five ACID metadata columns, hence the larger initial capacity.
        List<OrcColumn> fileReadColumns = isFullAcid ? new ArrayList<>(columns.size() + 5) : new ArrayList<>(columns.size());
        List<Type> fileReadTypes = isFullAcid ? new ArrayList<>(columns.size() + 5) : new ArrayList<>(columns.size());
        ImmutableList<String> acidColumnNames = null;
        List<ColumnAdaptation> columnAdaptations = new ArrayList<>(columns.size());

        // Only Hive ACID files will begin with bucket_
        // NOTE(review): the check below is contains("bucket"), not a prefix test — confirm that is intended.
        boolean fileNameContainsBucket = path.getName().contains("bucket");
        if (isFullAcid && fileNameContainsBucket) {
            // Skip the acid schema check in case of non-ACID files
            // The order of these names matches the order in which the ACID columns are added to
            // fileReadColumns below, so an index into this list is also an index into the read columns.
            acidColumnNames = ImmutableList.<String>builder().add(ACID_COLUMN_ORIGINAL_TRANSACTION, ACID_COLUMN_BUCKET, ACID_COLUMN_ROW_ID, ACID_COLUMN_CURRENT_TRANSACTION, ACID_COLUMN_OPERATION).build();
            verifyAcidSchema(reader, path);
            Map<String, OrcColumn> acidColumnsByName = uniqueIndex(fileColumns, orcColumn -> orcColumn.getColumnName().toLowerCase(ENGLISH));
            if (AcidUtils.isDeleteDelta(path.getParent())) {
                // Avoid reading column data from delete_delta files.
                // Call will come here in case of Minor VACUUM where all delete_delta files are merge together.
                fileColumns = ImmutableList.of();
            }
            else {
                // The index keys are lower-cased, so look the row struct up the same way as the
                // other ACID columns below.
                fileColumns = ensureColumnNameConsistency(acidColumnsByName.get(ACID_COLUMN_ROW_STRUCT.toLowerCase(ENGLISH)).getNestedColumns(), columns);
            }
            fileReadColumns.add(acidColumnsByName.get(ACID_COLUMN_ORIGINAL_TRANSACTION.toLowerCase(ENGLISH)));
            fileReadTypes.add(BIGINT);
            fileReadColumns.add(acidColumnsByName.get(ACID_COLUMN_BUCKET.toLowerCase(ENGLISH)));
            fileReadTypes.add(INTEGER);
            fileReadColumns.add(acidColumnsByName.get(ACID_COLUMN_ROW_ID.toLowerCase(ENGLISH)));
            fileReadTypes.add(BIGINT);
            fileReadColumns.add(acidColumnsByName.get(ACID_COLUMN_CURRENT_TRANSACTION.toLowerCase(ENGLISH)));
            fileReadTypes.add(BIGINT);
            fileReadColumns.add(acidColumnsByName.get(ACID_COLUMN_OPERATION.toLowerCase(ENGLISH)));
            fileReadTypes.add(INTEGER);
        }

        Map<String, OrcColumn> fileColumnsByName = ImmutableMap.of();
        if (useOrcColumnNames || isFullAcid) {
            verifyFileHasColumnNames(fileColumns, path);
            // Convert column names read from ORC files to lower case to be consistent with those stored in Hive Metastore
            fileColumnsByName = uniqueIndex(fileColumns, orcColumn -> orcColumn.getColumnName().toLowerCase(ENGLISH));
        }

        TupleDomainOrcPredicateBuilder predicateBuilder = TupleDomainOrcPredicate.builder().setBloomFiltersEnabled(orcBloomFiltersEnabled);
        Map<HiveColumnHandle, Domain> effectivePredicateDomains = effectivePredicate.getDomains().orElseThrow(() -> new IllegalArgumentException("Effective predicate is none"));
        for (HiveColumnHandle column : columns) {
            OrcColumn orcColumn = null;
            if (useOrcColumnNames || isFullAcid) {
                orcColumn = fileColumnsByName.get(column.getName());
            }
            else if (column.getHiveColumnIndex() >= 0 && column.getHiveColumnIndex() < fileColumns.size()) {
                orcColumn = fileColumns.get(column.getHiveColumnIndex());
            }

            Type readType = typeManager.getType(column.getTypeSignature());
            if (orcColumn != null) {
                // The adaptation points at the slot this column is about to occupy in fileReadColumns.
                int sourceIndex = fileReadColumns.size();
                columnAdaptations.add(ColumnAdaptation.sourceColumn(sourceIndex));
                fileReadColumns.add(orcColumn);
                fileReadTypes.add(readType);
                Domain domain = effectivePredicateDomains.get(column);
                if (domain != null) {
                    predicateBuilder.addColumn(orcColumn.getColumnId(), domain);
                }
            }
            else if (isFullAcid && readType instanceof RowType && column.getName().equalsIgnoreCase(HiveColumnHandle.UPDATE_ROW_ID_COLUMN_NAME)) {
                // The update-row-id column is assembled as a struct from the ACID metadata columns.
                // NOTE(review): acidColumnNames is still null when isFullAcid is set but the file
                // name does not contain "bucket"; reaching this branch in that case fails with a
                // NullPointerException (reported as HIVE_CANNOT_OPEN_SPLIT). Confirm that
                // combination cannot occur.
                HiveType hiveType = column.getHiveType();
                StructTypeInfo structTypeInfo = (StructTypeInfo) hiveType.getTypeInfo();
                List<String> fieldNames = structTypeInfo.getAllStructFieldNames();
                List<ColumnAdaptation> adaptations = fieldNames.stream().map(acidColumnNames::indexOf).map(c -> ColumnAdaptation.sourceColumn(c, false)).collect(Collectors.toList());
                columnAdaptations.add(ColumnAdaptation.structColumn(structTypeInfo, adaptations));
            }
            else {
                // The file does not contain this column: produce nulls of the requested type.
                columnAdaptations.add(ColumnAdaptation.nullColumn(readType));
            }
        }

        // Same domains as above, keyed by column name for the record reader.
        Map<String, Domain> domains = effectivePredicateDomains.entrySet().stream().collect(toMap(e -> e.getKey().getName(), Map.Entry::getValue));
        OrcRecordReader recordReader = reader.createRecordReader(fileReadColumns, fileReadTypes, predicateBuilder.build(), start, length, legacyFileTimeZone, systemMemoryUsage, INITIAL_BATCH_SIZE, exception -> handleException(orcDataSource.getId(), exception), indexes, splitMetadata, domains, orcCacheStore, orcCacheProperties, pageMetadataEnabled);
        OrcDeletedRows deletedRows = new OrcDeletedRows(path.getName(), deleteDeltaLocations, new OrcDeleteDeltaPageSourceFactory(sessionUser, configuration, hdfsEnvironment, maxMergeDistance, maxBufferSize, streamBufferSize, maxReadBlockSize, tinyStripeThreshold, lazyReadSmallRanges, orcBloomFiltersEnabled, stats), sessionUser, configuration, hdfsEnvironment, startRowOffsetOfFile);

        // Load eagerly when any supplied index is of a kind listed in EAGER_LOAD_INDEX_ID.
        boolean eagerload = false;
        if (indexes.isPresent()) {
            eagerload = indexes.get().stream().anyMatch(indexMetadata -> EAGER_LOAD_INDEX_ID.contains(indexMetadata.getIndex().getId()));
        }
        return new OrcPageSource(recordReader, columnAdaptations, orcDataSource, deletedRows, eagerload, systemMemoryUsage, stats);
    }
    catch (Exception e) {
        try {
            orcDataSource.close();
        }
        catch (IOException closeFailure) {
            // Closing is best effort; keep the close failure attached to the original error
            // instead of dropping it. The identity check avoids self-suppression.
            if (closeFailure != e) {
                e.addSuppressed(closeFailure);
            }
        }
        if (e instanceof PrestoException) {
            throw (PrestoException) e;
        }
        String message = splitError(e, path, start, length);
        if (e instanceof BlockMissingException) {
            throw new PrestoException(HIVE_MISSING_DATA, message, e);
        }
        throw new PrestoException(HIVE_CANNOT_OPEN_SPLIT, message, e);
    }
}
Aggregations