Use of io.trino.plugin.hive.AcidInfo in project trino by trinodb.
The class OrcPageSourceFactory, method createOrcPageSource.
private ConnectorPageSource createOrcPageSource(
        HdfsEnvironment hdfsEnvironment,
        ConnectorIdentity identity,
        Configuration configuration,
        Path path,
        long start,
        long length,
        long estimatedFileSize,
        List<HiveColumnHandle> columns,
        List<HiveColumnHandle> projections,
        boolean useOrcColumnNames,
        boolean isFullAcid,
        TupleDomain<HiveColumnHandle> effectivePredicate,
        DateTimeZone legacyFileTimeZone,
        OrcReaderOptions options,
        Optional<AcidInfo> acidInfo,
        OptionalInt bucketNumber,
        boolean originalFile,
        AcidTransaction transaction,
        FileFormatDataSourceStats stats)
{
    for (HiveColumnHandle column : columns) {
        checkArgument(column.getColumnType() == REGULAR, "column type must be regular: %s", column);
    }
    checkArgument(!effectivePredicate.isNone());

    OrcDataSource orcDataSource;
    boolean originalFilesPresent = acidInfo.isPresent() && !acidInfo.get().getOriginalFiles().isEmpty();
    try {
        FileSystem fileSystem = hdfsEnvironment.getFileSystem(identity, path, configuration);
        FSDataInputStream inputStream = hdfsEnvironment.doAs(identity, () -> fileSystem.open(path));
        orcDataSource = new HdfsOrcDataSource(
                new OrcDataSourceId(path.toString()),
                estimatedFileSize,
                options,
                inputStream,
                stats);
    }
    catch (Exception e) {
        if (nullToEmpty(e.getMessage()).trim().equals("Filesystem closed") || e instanceof FileNotFoundException) {
            throw new TrinoException(HIVE_CANNOT_OPEN_SPLIT, e);
        }
        throw new TrinoException(HIVE_CANNOT_OPEN_SPLIT, splitError(e, path, start, length), e);
    }

    AggregatedMemoryContext memoryUsage = newSimpleAggregatedMemoryContext();
    try {
        Optional<OrcReader> optionalOrcReader = OrcReader.createOrcReader(orcDataSource, options);
        if (optionalOrcReader.isEmpty()) {
            return new EmptyPageSource();
        }
        OrcReader reader = optionalOrcReader.get();
        if (!originalFile && acidInfo.isPresent() && !acidInfo.get().isOrcAcidVersionValidated()) {
            validateOrcAcidVersion(path, reader);
        }

        List<OrcColumn> fileColumns = reader.getRootColumn().getNestedColumns();
        int actualColumnCount = columns.size() + (isFullAcid ? 3 : 0);
        List<OrcColumn> fileReadColumns = new ArrayList<>(actualColumnCount);
        List<Type> fileReadTypes = new ArrayList<>(actualColumnCount);
        List<OrcReader.ProjectedLayout> fileReadLayouts = new ArrayList<>(actualColumnCount);

        if (isFullAcid && !originalFilesPresent) {
            verifyAcidSchema(reader, path);
            Map<String, OrcColumn> acidColumnsByName = uniqueIndex(fileColumns, orcColumn -> orcColumn.getColumnName().toLowerCase(ENGLISH));

            fileColumns = ensureColumnNameConsistency(acidColumnsByName.get(AcidSchema.ACID_COLUMN_ROW_STRUCT.toLowerCase(ENGLISH)).getNestedColumns(), columns);

            fileReadColumns.add(acidColumnsByName.get(AcidSchema.ACID_COLUMN_ORIGINAL_TRANSACTION.toLowerCase(ENGLISH)));
            fileReadTypes.add(BIGINT);
            fileReadLayouts.add(fullyProjectedLayout());

            fileReadColumns.add(acidColumnsByName.get(AcidSchema.ACID_COLUMN_BUCKET.toLowerCase(ENGLISH)));
            fileReadTypes.add(INTEGER);
            fileReadLayouts.add(fullyProjectedLayout());

            fileReadColumns.add(acidColumnsByName.get(AcidSchema.ACID_COLUMN_ROW_ID.toLowerCase(ENGLISH)));
            fileReadTypes.add(BIGINT);
            fileReadLayouts.add(fullyProjectedLayout());
        }

        Map<String, OrcColumn> fileColumnsByName = ImmutableMap.of();
        if (useOrcColumnNames || isFullAcid) {
            verifyFileHasColumnNames(fileColumns, path);
            // Convert column names read from ORC files to lower case to be consistent with those stored in Hive Metastore
            fileColumnsByName = uniqueIndex(fileColumns, orcColumn -> orcColumn.getColumnName().toLowerCase(ENGLISH));
        }

        Map<String, List<List<String>>> projectionsByColumnName = ImmutableMap.of();
        Map<Integer, List<List<String>>> projectionsByColumnIndex = ImmutableMap.of();
        if (useOrcColumnNames || isFullAcid) {
            projectionsByColumnName = projections.stream()
                    .collect(Collectors.groupingBy(
                            HiveColumnHandle::getBaseColumnName,
                            mapping(OrcPageSourceFactory::getDereferencesAsList, toList())));
        }
        else {
            projectionsByColumnIndex = projections.stream()
                    .collect(Collectors.groupingBy(
                            HiveColumnHandle::getBaseHiveColumnIndex,
                            mapping(OrcPageSourceFactory::getDereferencesAsList, toList())));
        }

        TupleDomainOrcPredicateBuilder predicateBuilder = TupleDomainOrcPredicate.builder()
                .setBloomFiltersEnabled(options.isBloomFiltersEnabled())
                .setDomainCompactionThreshold(domainCompactionThreshold);
        Map<HiveColumnHandle, Domain> effectivePredicateDomains = effectivePredicate.getDomains()
                .orElseThrow(() -> new IllegalArgumentException("Effective predicate is none"));

        List<ColumnAdaptation> columnAdaptations = new ArrayList<>(columns.size());
        for (HiveColumnHandle column : columns) {
            OrcColumn orcColumn = null;
            OrcReader.ProjectedLayout projectedLayout = null;
            Map<Optional<HiveColumnProjectionInfo>, Domain> columnDomains = null;

            if (useOrcColumnNames || isFullAcid) {
                String columnName = column.getName().toLowerCase(ENGLISH);
                orcColumn = fileColumnsByName.get(columnName);
                if (orcColumn != null) {
                    projectedLayout = createProjectedLayout(orcColumn, projectionsByColumnName.get(columnName));
                    columnDomains = effectivePredicateDomains.entrySet().stream()
                            .filter(columnDomain -> columnDomain.getKey().getBaseColumnName().toLowerCase(ENGLISH).equals(columnName))
                            .collect(toImmutableMap(columnDomain -> columnDomain.getKey().getHiveColumnProjectionInfo(), Map.Entry::getValue));
                }
            }
            else if (column.getBaseHiveColumnIndex() < fileColumns.size()) {
                orcColumn = fileColumns.get(column.getBaseHiveColumnIndex());
                if (orcColumn != null) {
                    projectedLayout = createProjectedLayout(orcColumn, projectionsByColumnIndex.get(column.getBaseHiveColumnIndex()));
                    columnDomains = effectivePredicateDomains.entrySet().stream()
                            .filter(columnDomain -> columnDomain.getKey().getBaseHiveColumnIndex() == column.getBaseHiveColumnIndex())
                            .collect(toImmutableMap(columnDomain -> columnDomain.getKey().getHiveColumnProjectionInfo(), Map.Entry::getValue));
                }
            }

            Type readType = column.getType();
            if (orcColumn != null) {
                int sourceIndex = fileReadColumns.size();
                columnAdaptations.add(ColumnAdaptation.sourceColumn(sourceIndex));
                fileReadColumns.add(orcColumn);
                fileReadTypes.add(readType);
                fileReadLayouts.add(projectedLayout);

                // Add predicates on top-level and nested columns
                for (Map.Entry<Optional<HiveColumnProjectionInfo>, Domain> columnDomain : columnDomains.entrySet()) {
                    OrcColumn nestedColumn = getNestedColumn(orcColumn, columnDomain.getKey());
                    if (nestedColumn != null) {
                        predicateBuilder.addColumn(nestedColumn.getColumnId(), columnDomain.getValue());
                    }
                }
            }
            else {
                columnAdaptations.add(ColumnAdaptation.nullColumn(readType));
            }
        }

        OrcRecordReader recordReader = reader.createRecordReader(
                fileReadColumns,
                fileReadTypes,
                fileReadLayouts,
                predicateBuilder.build(),
                start,
                length,
                legacyFileTimeZone,
                memoryUsage,
                INITIAL_BATCH_SIZE,
                exception -> handleException(orcDataSource.getId(), exception),
                NameBasedFieldMapper::create);

        Optional<OrcDeletedRows> deletedRows = acidInfo.map(info -> new OrcDeletedRows(
                path.getName(),
                new OrcDeleteDeltaPageSourceFactory(options, identity, configuration, hdfsEnvironment, stats),
                identity,
                configuration,
                hdfsEnvironment,
                info,
                bucketNumber,
                memoryUsage));

        Optional<Long> originalFileRowId = acidInfo
                .filter(OrcPageSourceFactory::hasOriginalFiles)
                .map(info -> OriginalFilesUtils.getPrecedingRowCount(
                        acidInfo.get().getOriginalFiles(),
                        path,
                        hdfsEnvironment,
                        identity,
                        options,
                        configuration,
                        stats));

        if (transaction.isDelete()) {
            if (originalFile) {
                int bucket = bucketNumber.orElse(0);
                long startingRowId = originalFileRowId.orElse(0L);
                columnAdaptations.add(ColumnAdaptation.originalFileRowIdColumn(startingRowId, bucket));
            }
            else {
                columnAdaptations.add(ColumnAdaptation.rowIdColumn());
            }
        }
        else if (transaction.isUpdate()) {
            HiveUpdateProcessor updateProcessor = transaction.getUpdateProcessor()
                    .orElseThrow(() -> new IllegalArgumentException("updateProcessor not present"));
            List<HiveColumnHandle> dependencyColumns = projections.stream()
                    .filter(HiveColumnHandle::isBaseColumn)
                    .collect(toImmutableList());
            if (originalFile) {
                int bucket = bucketNumber.orElse(0);
                long startingRowId = originalFileRowId.orElse(0L);
                columnAdaptations.add(updatedRowColumnsWithOriginalFiles(startingRowId, bucket, updateProcessor, dependencyColumns));
            }
            else {
                columnAdaptations.add(updatedRowColumns(updateProcessor, dependencyColumns));
            }
        }

        return new OrcPageSource(
                recordReader,
                columnAdaptations,
                orcDataSource,
                deletedRows,
                originalFileRowId,
                memoryUsage,
                stats);
    }
    catch (Exception e) {
        try {
            orcDataSource.close();
        }
        catch (IOException ignored) {
        }
        if (e instanceof TrinoException) {
            throw (TrinoException) e;
        }
        String message = splitError(e, path, start, length);
        if (e instanceof BlockMissingException) {
            throw new TrinoException(HIVE_MISSING_DATA, message, e);
        }
        throw new TrinoException(HIVE_CANNOT_OPEN_SPLIT, message, e);
    }
}
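The method above builds its column lookups with Guava's Maps.uniqueIndex (statically imported as uniqueIndex), keyed by lower-cased column names; for full-ACID files it then pulls the synthetic originalTransaction, bucket, and rowId fields out of that index. A minimal, self-contained sketch of the same indexing pattern follows; the field names mirror the standard Hive ACID ORC layout, and the class name and example data are made up for illustration:

import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Maps;

import java.util.List;

import static java.util.Locale.ENGLISH;

public class AcidColumnIndexSketch
{
    public static void main(String[] args)
    {
        // Stand-ins for the ORC file's top-level ACID field names
        List<String> fileColumnNames = List.of("operation", "originalTransaction", "bucket", "rowId", "currentTransaction", "row");

        // Index by lower-cased name, the same way createOrcPageSource indexes fileColumns
        ImmutableMap<String, String> columnsByLowerCaseName =
                Maps.uniqueIndex(fileColumnNames, name -> name.toLowerCase(ENGLISH));

        // Lookups then use the lower-cased names, as the AcidSchema constants are lower-cased above
        System.out.println(columnsByLowerCaseName.get("rowid"));               // rowId
        System.out.println(columnsByLowerCaseName.get("originaltransaction")); // originalTransaction
    }
}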
Use of io.trino.plugin.hive.AcidInfo in project trino by trinodb.
The class InternalHiveSplitFactory, method createInternalHiveSplit.
private Optional<InternalHiveSplit> createInternalHiveSplit(
        Path path,
        BlockLocation[] blockLocations,
        long start,
        long length,
        // Estimated because, for example, encrypted S3 files may be padded, so reported size may not reflect actual size
        long estimatedFileSize,
        long fileModificationTime,
        OptionalInt bucketNumber,
        boolean splittable,
        Optional<AcidInfo> acidInfo)
{
    String pathString = path.toString();
    if (!pathMatchesPredicate(pathDomain, pathString)) {
        return Optional.empty();
    }

    // per HIVE-13040 empty files are allowed, but they produce no splits
    if (estimatedFileSize == 0) {
        return Optional.empty();
    }

    // The partition filter (e.g. a dynamic filter) may not have been ready when the partition was loaded,
    // but it might be ready when splits are enumerated lazily.
    if (!partitionMatchSupplier.getAsBoolean()) {
        return Optional.empty();
    }

    if (maxSplitFileSize.isPresent() && estimatedFileSize > maxSplitFileSize.get()) {
        return Optional.empty();
    }

    ImmutableList.Builder<InternalHiveBlock> blockBuilder = ImmutableList.builder();
    for (BlockLocation blockLocation : blockLocations) {
        // clamp the block range to the split range
        long blockStart = Math.max(start, blockLocation.getOffset());
        long blockEnd = Math.min(start + length, blockLocation.getOffset() + blockLocation.getLength());
        if (blockStart > blockEnd) {
            // block is outside split range
            continue;
        }
        if (blockStart == blockEnd && !(blockStart == start && blockEnd == start + length)) {
            // skip zero-width block, except in the special circumstance: slice is empty, and the block covers the empty slice interval.
            continue;
        }
        blockBuilder.add(new InternalHiveBlock(blockStart, blockEnd, getHostAddresses(blockLocation)));
    }
    List<InternalHiveBlock> blocks = blockBuilder.build();
    checkBlocks(path, blocks, start, length);

    if (!splittable) {
        // not splittable, use the hosts from the first block if it exists
        blocks = ImmutableList.of(new InternalHiveBlock(start, start + length, blocks.get(0).getAddresses()));
    }

    int bucketNumberIndex = bucketNumber.orElse(0);
    return Optional.of(new InternalHiveSplit(
            partitionName,
            pathString,
            start,
            start + length,
            estimatedFileSize,
            fileModificationTime,
            schema,
            partitionKeys,
            blocks,
            bucketNumber,
            () -> bucketStatementCounters.computeIfAbsent(bucketNumberIndex, index -> new AtomicInteger()).getAndIncrement(),
            splittable,
            forceLocalScheduling && allBlocksHaveAddress(blocks),
            tableToPartitionMapping,
            bucketConversion,
            bucketValidation,
            s3SelectPushdownEnabled && S3SelectPushdown.isCompressionCodecSupported(inputFormat, path),
            acidInfo,
            partitionMatchSupplier));
}
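The loop over blockLocations in the method above implements a small clamping rule: each HDFS block is intersected with the split's [start, start + length) range, blocks that fall outside are dropped, and zero-width intersections are kept only when the split itself is empty and the block covers it. Below is a standalone sketch of that rule, assuming nothing beyond plain Java; the class, record, and method names are invented for illustration:

import java.util.Optional;

public class BlockClampingSketch
{
    // A clamped [start, end) range within the split, or empty if the block should be skipped
    record ClampedBlock(long start, long end) {}

    static Optional<ClampedBlock> clamp(long splitStart, long splitLength, long blockOffset, long blockLength)
    {
        long blockStart = Math.max(splitStart, blockOffset);
        long blockEnd = Math.min(splitStart + splitLength, blockOffset + blockLength);
        if (blockStart > blockEnd) {
            // block is entirely outside the split range
            return Optional.empty();
        }
        if (blockStart == blockEnd && !(blockStart == splitStart && blockEnd == splitStart + splitLength)) {
            // zero-width intersection and the split itself is not empty: skip
            return Optional.empty();
        }
        return Optional.of(new ClampedBlock(blockStart, blockEnd));
    }

    public static void main(String[] args)
    {
        System.out.println(clamp(100, 50, 0, 120));  // overlaps the split head -> ClampedBlock[start=100, end=120]
        System.out.println(clamp(100, 50, 200, 64)); // entirely beyond the split -> empty
        System.out.println(clamp(100, 0, 100, 64));  // empty split covered by the block -> ClampedBlock[start=100, end=100]
    }
}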
Use of io.trino.plugin.hive.AcidInfo in project trino by trinodb.
The class TestOrcPageSourceFactory, method readFile.
private static List<Nation> readFile(Map<NationColumn, Integer> columns, OptionalLong nationKeyPredicate, Optional<AcidInfo> acidInfo, String filePath, long fileSize)
{
    TupleDomain<HiveColumnHandle> tupleDomain = TupleDomain.all();
    if (nationKeyPredicate.isPresent()) {
        tupleDomain = TupleDomain.withColumnDomains(ImmutableMap.of(toHiveColumnHandle(NATION_KEY, 0), Domain.singleValue(INTEGER, nationKeyPredicate.getAsLong())));
    }

    List<HiveColumnHandle> columnHandles = columns.entrySet().stream()
            .map(entry -> toHiveColumnHandle(entry.getKey(), entry.getValue()))
            .collect(toImmutableList());
    List<String> columnNames = columnHandles.stream()
            .map(HiveColumnHandle::getName)
            .collect(toImmutableList());

    Optional<ReaderPageSource> pageSourceWithProjections = PAGE_SOURCE_FACTORY.createPageSource(
            new JobConf(new Configuration(false)),
            SESSION,
            new Path(filePath),
            0,
            fileSize,
            fileSize,
            createSchema(),
            columnHandles,
            tupleDomain,
            acidInfo,
            OptionalInt.empty(),
            false,
            NO_ACID_TRANSACTION);

    checkArgument(pageSourceWithProjections.isPresent());
    checkArgument(pageSourceWithProjections.get().getReaderColumns().isEmpty(), "projected columns not expected here");

    ConnectorPageSource pageSource = pageSourceWithProjections.get().get();

    int nationKeyColumn = columnNames.indexOf("n_nationkey");
    int nameColumn = columnNames.indexOf("n_name");
    int regionKeyColumn = columnNames.indexOf("n_regionkey");
    int commentColumn = columnNames.indexOf("n_comment");

    ImmutableList.Builder<Nation> rows = ImmutableList.builder();
    while (!pageSource.isFinished()) {
        Page page = pageSource.getNextPage();
        if (page == null) {
            continue;
        }

        page = page.getLoadedPage();
        for (int position = 0; position < page.getPositionCount(); position++) {
            long nationKey = -42;
            if (nationKeyColumn >= 0) {
                nationKey = BIGINT.getLong(page.getBlock(nationKeyColumn), position);
            }

            String name = "<not read>";
            if (nameColumn >= 0) {
                name = VARCHAR.getSlice(page.getBlock(nameColumn), position).toStringUtf8();
            }

            long regionKey = -42;
            if (regionKeyColumn >= 0) {
                regionKey = BIGINT.getLong(page.getBlock(regionKeyColumn), position);
            }

            String comment = "<not read>";
            if (commentColumn >= 0) {
                comment = VARCHAR.getSlice(page.getBlock(commentColumn), position).toStringUtf8();
            }

            rows.add(new Nation(position, nationKey, name, regionKey, comment));
        }
    }
    return rows.build();
}
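A sketch of how this helper might be called from a test in the same class for a plain, non-ACID read; the NAME constant with index 1, the file path, and the file size are illustrative assumptions rather than values taken from the actual tests:

List<Nation> rows = readFile(
        Map.of(NATION_KEY, 0, NAME, 1),  // request only n_nationkey and n_name (NAME/1 assumed)
        OptionalLong.of(17),             // builds an n_nationkey = 17 tuple domain for predicate pushdown
        Optional.empty(),                // no AcidInfo: read the file as plain ORC
        "/tmp/nation/000000_0",          // hypothetical path
        1780);                           // hypothetical file size in bytes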
Use of io.trino.plugin.hive.AcidInfo in project trino by trinodb.
The class TestOrcPageSourceFactory, method testFullFileReadOriginalFilesTable.
@Test
public void testFullFileReadOriginalFilesTable()
        throws Exception
{
    File tableFile = new File(getResource("fullacidNationTableWithOriginalFiles/000000_0").toURI());
    String tablePath = tableFile.getParent();

    AcidInfo acidInfo = AcidInfo.builder(new Path(tablePath))
            .addDeleteDelta(new Path(tablePath, deleteDeltaSubdir(10000001, 10000001, 0)))
            .addOriginalFile(new Path(tablePath, "000000_0"), 1780, 0)
            .setOrcAcidVersionValidated(true)
            .buildWithRequiredOriginalFiles(0);

    List<Nation> expected = expectedResult(OptionalLong.empty(), nationKey -> nationKey == 24, 1);
    List<Nation> result = readFile(ALL_COLUMNS, OptionalLong.empty(), Optional.of(acidInfo), tablePath + "/000000_0", 1780);

    assertEquals(result.size(), expected.size());
    int deletedRowKey = 24;
    String deletedRowNameColumn = "UNITED STATES";
    assertFalse(
            result.stream().anyMatch(acidNationRow -> acidNationRow.getName().equals(deletedRowNameColumn) && acidNationRow.getNationKey() == deletedRowKey),
            "Deleted row shouldn't be present in the result");
}
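For orientation, the on-disk layout this test describes to AcidInfo is roughly the following; the delete-delta directory name is the conventional Hive format expected from deleteDeltaSubdir(10000001, 10000001, 0), so treat the exact zero-padding of the statement id as an assumption:

fullacidNationTableWithOriginalFiles/
    000000_0                               <- "original file" written before the table became ACID (1780 bytes, bucket 0)
    delete_delta_10000001_10000001_0000/   <- delete delta marking the UNITED STATES row (nation key 24) as deleted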
Use of io.trino.plugin.hive.AcidInfo in project trino by trinodb.
The class TestOrcPageSourceFactory, method testReadWithAcidVersionValidationNoVersionInMetadata.
@Test
public void testReadWithAcidVersionValidationNoVersionInMetadata()
        throws Exception
{
    File tableFile = new File(getResource("acid_version_validation/no_orc_acid_version_in_metadata/00000_0").toURI());
    String tablePath = tableFile.getParent();

    Optional<AcidInfo> acidInfo = AcidInfo.builder(new Path(tablePath))
            .setOrcAcidVersionValidated(false)
            .build();

    Assertions.assertThatThrownBy(() -> readFile(Map.of(), OptionalLong.empty(), acidInfo, tableFile.getPath(), 730))
            .hasMessageMatching("Hive transactional tables are supported since Hive 3.0. Expected `hive.acid.version` in ORC metadata"
                    + " in .*/acid_version_validation/no_orc_acid_version_in_metadata/00000_0 to be >=2 but was <empty>."
                    + " If you have upgraded from an older version of Hive, make sure a major compaction has been run at least once after the upgrade.");
}