use of io.trino.spi.connector.EmptyPageSource in project trino by trinodb.
the class SystemPageSourceProvider method createPageSource.
@Override
public ConnectorPageSource createPageSource(ConnectorTransactionHandle transaction, ConnectorSession session, ConnectorSplit split, ConnectorTableHandle table, List<ColumnHandle> columns, DynamicFilter dynamicFilter) {
requireNonNull(columns, "columns is null");
SystemTransactionHandle systemTransaction = (SystemTransactionHandle) transaction;
SystemSplit systemSplit = (SystemSplit) split;
SchemaTableName tableName = ((SystemTableHandle) table).getSchemaTableName();
SystemTable systemTable = tables.getSystemTable(session, tableName).orElseThrow(() -> new TrinoException(NOT_FOUND, format("Table '%s' not found", tableName)));
List<ColumnMetadata> tableColumns = systemTable.getTableMetadata().getColumns();
Map<String, Integer> columnsByName = new HashMap<>();
for (int i = 0; i < tableColumns.size(); i++) {
ColumnMetadata column = tableColumns.get(i);
if (columnsByName.put(column.getName(), i) != null) {
throw new TrinoException(GENERIC_INTERNAL_ERROR, "Duplicate column name: " + column.getName());
}
}
ImmutableList.Builder<Integer> userToSystemFieldIndex = ImmutableList.builder();
for (ColumnHandle column : columns) {
String columnName = ((SystemColumnHandle) column).getColumnName();
Integer index = columnsByName.get(columnName);
if (index == null) {
throw new TrinoException(GENERIC_INTERNAL_ERROR, format("Column does not exist: %s.%s", tableName, columnName));
}
userToSystemFieldIndex.add(index);
}
TupleDomain<ColumnHandle> constraint = systemSplit.getConstraint();
if (constraint.isNone()) {
return new EmptyPageSource();
}
TupleDomain<Integer> newConstraint = systemSplit.getConstraint().transformKeys(columnHandle -> columnsByName.get(((SystemColumnHandle) columnHandle).getColumnName()));
try {
return new MappedPageSource(systemTable.pageSource(systemTransaction.getConnectorTransactionHandle(), session, newConstraint), userToSystemFieldIndex.build());
} catch (UnsupportedOperationException e) {
return new RecordPageSource(new MappedRecordSet(toRecordSet(systemTransaction.getConnectorTransactionHandle(), systemTable, session, newConstraint), userToSystemFieldIndex.build()));
}
}
use of io.trino.spi.connector.EmptyPageSource in project trino by trinodb.
the class PageSourceManager method createPageSource.
@Override
public ConnectorPageSource createPageSource(Session session, Split split, TableHandle table, List<ColumnHandle> columns, DynamicFilter dynamicFilter) {
requireNonNull(columns, "columns is null");
checkArgument(split.getCatalogName().equals(table.getCatalogName()), "mismatched split and table");
CatalogName catalogName = split.getCatalogName();
ConnectorPageSourceProvider provider = getPageSourceProvider(catalogName);
TupleDomain<ColumnHandle> constraint = dynamicFilter.getCurrentPredicate();
if (constraint.isNone()) {
return new EmptyPageSource();
}
if (!isAllowPushdownIntoConnectors(session)) {
dynamicFilter = DynamicFilter.EMPTY;
}
return provider.createPageSource(table.getTransaction(), session.toConnectorSession(catalogName), split.getConnectorSplit(), table.getConnectorHandle(), columns, dynamicFilter);
}
use of io.trino.spi.connector.EmptyPageSource in project trino by trinodb.
the class HivePageSourceProvider method createPageSource.
@Override
public ConnectorPageSource createPageSource(ConnectorTransactionHandle transaction, ConnectorSession session, ConnectorSplit split, ConnectorTableHandle tableHandle, List<ColumnHandle> columns, DynamicFilter dynamicFilter) {
HiveTableHandle hiveTable = (HiveTableHandle) tableHandle;
HiveSplit hiveSplit = (HiveSplit) split;
if (shouldSkipBucket(hiveTable, hiveSplit, dynamicFilter)) {
return new EmptyPageSource();
}
List<HiveColumnHandle> hiveColumns = columns.stream().map(HiveColumnHandle.class::cast).collect(toList());
List<HiveColumnHandle> dependencyColumns = hiveColumns.stream().filter(HiveColumnHandle::isBaseColumn).collect(toImmutableList());
if (hiveTable.isAcidUpdate()) {
hiveColumns = hiveTable.getUpdateProcessor().orElseThrow(() -> new IllegalArgumentException("update processor not present")).mergeWithNonUpdatedColumns(hiveColumns);
}
Path path = new Path(hiveSplit.getPath());
boolean originalFile = ORIGINAL_FILE_PATH_MATCHER.matcher(path.toString()).matches();
List<ColumnMapping> columnMappings = ColumnMapping.buildColumnMappings(hiveSplit.getPartitionName(), hiveSplit.getPartitionKeys(), hiveColumns, hiveSplit.getBucketConversion().map(BucketConversion::getBucketColumnHandles).orElse(ImmutableList.of()), hiveSplit.getTableToPartitionMapping(), path, hiveSplit.getBucketNumber(), hiveSplit.getEstimatedFileSize(), hiveSplit.getFileModifiedTime());
// This can happen when dynamic filters are collected after partition splits were listed.
if (shouldSkipSplit(columnMappings, dynamicFilter)) {
return new EmptyPageSource();
}
Configuration configuration = hdfsEnvironment.getConfiguration(new HdfsContext(session), path);
TupleDomain<HiveColumnHandle> simplifiedDynamicFilter = dynamicFilter.getCurrentPredicate().transformKeys(HiveColumnHandle.class::cast).simplify(domainCompactionThreshold);
Optional<ConnectorPageSource> pageSource = createHivePageSource(pageSourceFactories, cursorProviders, configuration, session, path, hiveSplit.getBucketNumber(), hiveSplit.getStart(), hiveSplit.getLength(), hiveSplit.getEstimatedFileSize(), hiveSplit.getSchema(), hiveTable.getCompactEffectivePredicate().intersect(simplifiedDynamicFilter), hiveColumns, typeManager, hiveSplit.getBucketConversion(), hiveSplit.getBucketValidation(), hiveSplit.isS3SelectPushdownEnabled(), hiveSplit.getAcidInfo(), originalFile, hiveTable.getTransaction(), columnMappings);
if (pageSource.isPresent()) {
ConnectorPageSource source = pageSource.get();
if (hiveTable.isAcidDelete() || hiveTable.isAcidUpdate()) {
checkArgument(orcFileWriterFactory.isPresent(), "orcFileWriterFactory not supplied but required for DELETE and UPDATE");
HivePageSource hivePageSource = (HivePageSource) source;
OrcPageSource orcPageSource = (OrcPageSource) hivePageSource.getDelegate();
ColumnMetadata<OrcType> columnMetadata = orcPageSource.getColumnTypes();
int acidRowColumnId = originalFile ? 0 : ACID_ROW_STRUCT_COLUMN_ID;
HiveType rowType = fromOrcTypeToHiveType(columnMetadata.get(new OrcColumnId(acidRowColumnId)), columnMetadata);
long currentSplitNumber = hiveSplit.getSplitNumber();
if (currentSplitNumber >= MAX_NUMBER_OF_SPLITS) {
throw new TrinoException(GENERIC_INSUFFICIENT_RESOURCES, format("Number of splits is higher than maximum possible number of splits %d", MAX_NUMBER_OF_SPLITS));
}
long initialRowId = currentSplitNumber << PER_SPLIT_ROW_ID_BITS;
return new HiveUpdatablePageSource(hiveTable, hiveSplit.getPartitionName(), hiveSplit.getStatementId(), source, typeManager, hiveSplit.getBucketNumber(), path, originalFile, orcFileWriterFactory.get(), configuration, session, rowType, dependencyColumns, hiveTable.getTransaction().getOperation(), initialRowId, MAX_NUMBER_OF_ROWS_PER_SPLIT);
}
return source;
}
throw new RuntimeException("Could not find a file reader for split " + hiveSplit);
}
use of io.trino.spi.connector.EmptyPageSource in project trino by trinodb.
the class OrcPageSourceFactory method createOrcPageSource.
private ConnectorPageSource createOrcPageSource(HdfsEnvironment hdfsEnvironment, ConnectorIdentity identity, Configuration configuration, Path path, long start, long length, long estimatedFileSize, List<HiveColumnHandle> columns, List<HiveColumnHandle> projections, boolean useOrcColumnNames, boolean isFullAcid, TupleDomain<HiveColumnHandle> effectivePredicate, DateTimeZone legacyFileTimeZone, OrcReaderOptions options, Optional<AcidInfo> acidInfo, OptionalInt bucketNumber, boolean originalFile, AcidTransaction transaction, FileFormatDataSourceStats stats) {
for (HiveColumnHandle column : columns) {
checkArgument(column.getColumnType() == REGULAR, "column type must be regular: %s", column);
}
checkArgument(!effectivePredicate.isNone());
OrcDataSource orcDataSource;
boolean originalFilesPresent = acidInfo.isPresent() && !acidInfo.get().getOriginalFiles().isEmpty();
try {
FileSystem fileSystem = hdfsEnvironment.getFileSystem(identity, path, configuration);
FSDataInputStream inputStream = hdfsEnvironment.doAs(identity, () -> fileSystem.open(path));
orcDataSource = new HdfsOrcDataSource(new OrcDataSourceId(path.toString()), estimatedFileSize, options, inputStream, stats);
} catch (Exception e) {
if (nullToEmpty(e.getMessage()).trim().equals("Filesystem closed") || e instanceof FileNotFoundException) {
throw new TrinoException(HIVE_CANNOT_OPEN_SPLIT, e);
}
throw new TrinoException(HIVE_CANNOT_OPEN_SPLIT, splitError(e, path, start, length), e);
}
AggregatedMemoryContext memoryUsage = newSimpleAggregatedMemoryContext();
try {
Optional<OrcReader> optionalOrcReader = OrcReader.createOrcReader(orcDataSource, options);
if (optionalOrcReader.isEmpty()) {
return new EmptyPageSource();
}
OrcReader reader = optionalOrcReader.get();
if (!originalFile && acidInfo.isPresent() && !acidInfo.get().isOrcAcidVersionValidated()) {
validateOrcAcidVersion(path, reader);
}
List<OrcColumn> fileColumns = reader.getRootColumn().getNestedColumns();
int actualColumnCount = columns.size() + (isFullAcid ? 3 : 0);
List<OrcColumn> fileReadColumns = new ArrayList<>(actualColumnCount);
List<Type> fileReadTypes = new ArrayList<>(actualColumnCount);
List<OrcReader.ProjectedLayout> fileReadLayouts = new ArrayList<>(actualColumnCount);
if (isFullAcid && !originalFilesPresent) {
verifyAcidSchema(reader, path);
Map<String, OrcColumn> acidColumnsByName = uniqueIndex(fileColumns, orcColumn -> orcColumn.getColumnName().toLowerCase(ENGLISH));
fileColumns = ensureColumnNameConsistency(acidColumnsByName.get(AcidSchema.ACID_COLUMN_ROW_STRUCT.toLowerCase(ENGLISH)).getNestedColumns(), columns);
fileReadColumns.add(acidColumnsByName.get(AcidSchema.ACID_COLUMN_ORIGINAL_TRANSACTION.toLowerCase(ENGLISH)));
fileReadTypes.add(BIGINT);
fileReadLayouts.add(fullyProjectedLayout());
fileReadColumns.add(acidColumnsByName.get(AcidSchema.ACID_COLUMN_BUCKET.toLowerCase(ENGLISH)));
fileReadTypes.add(INTEGER);
fileReadLayouts.add(fullyProjectedLayout());
fileReadColumns.add(acidColumnsByName.get(AcidSchema.ACID_COLUMN_ROW_ID.toLowerCase(ENGLISH)));
fileReadTypes.add(BIGINT);
fileReadLayouts.add(fullyProjectedLayout());
}
Map<String, OrcColumn> fileColumnsByName = ImmutableMap.of();
if (useOrcColumnNames || isFullAcid) {
verifyFileHasColumnNames(fileColumns, path);
// Convert column names read from ORC files to lower case to be consistent with those stored in Hive Metastore
fileColumnsByName = uniqueIndex(fileColumns, orcColumn -> orcColumn.getColumnName().toLowerCase(ENGLISH));
}
Map<String, List<List<String>>> projectionsByColumnName = ImmutableMap.of();
Map<Integer, List<List<String>>> projectionsByColumnIndex = ImmutableMap.of();
if (useOrcColumnNames || isFullAcid) {
projectionsByColumnName = projections.stream().collect(Collectors.groupingBy(HiveColumnHandle::getBaseColumnName, mapping(OrcPageSourceFactory::getDereferencesAsList, toList())));
} else {
projectionsByColumnIndex = projections.stream().collect(Collectors.groupingBy(HiveColumnHandle::getBaseHiveColumnIndex, mapping(OrcPageSourceFactory::getDereferencesAsList, toList())));
}
TupleDomainOrcPredicateBuilder predicateBuilder = TupleDomainOrcPredicate.builder().setBloomFiltersEnabled(options.isBloomFiltersEnabled()).setDomainCompactionThreshold(domainCompactionThreshold);
Map<HiveColumnHandle, Domain> effectivePredicateDomains = effectivePredicate.getDomains().orElseThrow(() -> new IllegalArgumentException("Effective predicate is none"));
List<ColumnAdaptation> columnAdaptations = new ArrayList<>(columns.size());
for (HiveColumnHandle column : columns) {
OrcColumn orcColumn = null;
OrcReader.ProjectedLayout projectedLayout = null;
Map<Optional<HiveColumnProjectionInfo>, Domain> columnDomains = null;
if (useOrcColumnNames || isFullAcid) {
String columnName = column.getName().toLowerCase(ENGLISH);
orcColumn = fileColumnsByName.get(columnName);
if (orcColumn != null) {
projectedLayout = createProjectedLayout(orcColumn, projectionsByColumnName.get(columnName));
columnDomains = effectivePredicateDomains.entrySet().stream().filter(columnDomain -> columnDomain.getKey().getBaseColumnName().toLowerCase(ENGLISH).equals(columnName)).collect(toImmutableMap(columnDomain -> columnDomain.getKey().getHiveColumnProjectionInfo(), Map.Entry::getValue));
}
} else if (column.getBaseHiveColumnIndex() < fileColumns.size()) {
orcColumn = fileColumns.get(column.getBaseHiveColumnIndex());
if (orcColumn != null) {
projectedLayout = createProjectedLayout(orcColumn, projectionsByColumnIndex.get(column.getBaseHiveColumnIndex()));
columnDomains = effectivePredicateDomains.entrySet().stream().filter(columnDomain -> columnDomain.getKey().getBaseHiveColumnIndex() == column.getBaseHiveColumnIndex()).collect(toImmutableMap(columnDomain -> columnDomain.getKey().getHiveColumnProjectionInfo(), Map.Entry::getValue));
}
}
Type readType = column.getType();
if (orcColumn != null) {
int sourceIndex = fileReadColumns.size();
columnAdaptations.add(ColumnAdaptation.sourceColumn(sourceIndex));
fileReadColumns.add(orcColumn);
fileReadTypes.add(readType);
fileReadLayouts.add(projectedLayout);
// Add predicates on top-level and nested columns
for (Map.Entry<Optional<HiveColumnProjectionInfo>, Domain> columnDomain : columnDomains.entrySet()) {
OrcColumn nestedColumn = getNestedColumn(orcColumn, columnDomain.getKey());
if (nestedColumn != null) {
predicateBuilder.addColumn(nestedColumn.getColumnId(), columnDomain.getValue());
}
}
} else {
columnAdaptations.add(ColumnAdaptation.nullColumn(readType));
}
}
OrcRecordReader recordReader = reader.createRecordReader(fileReadColumns, fileReadTypes, fileReadLayouts, predicateBuilder.build(), start, length, legacyFileTimeZone, memoryUsage, INITIAL_BATCH_SIZE, exception -> handleException(orcDataSource.getId(), exception), NameBasedFieldMapper::create);
Optional<OrcDeletedRows> deletedRows = acidInfo.map(info -> new OrcDeletedRows(path.getName(), new OrcDeleteDeltaPageSourceFactory(options, identity, configuration, hdfsEnvironment, stats), identity, configuration, hdfsEnvironment, info, bucketNumber, memoryUsage));
Optional<Long> originalFileRowId = acidInfo.filter(OrcPageSourceFactory::hasOriginalFiles).map(info -> OriginalFilesUtils.getPrecedingRowCount(acidInfo.get().getOriginalFiles(), path, hdfsEnvironment, identity, options, configuration, stats));
if (transaction.isDelete()) {
if (originalFile) {
int bucket = bucketNumber.orElse(0);
long startingRowId = originalFileRowId.orElse(0L);
columnAdaptations.add(ColumnAdaptation.originalFileRowIdColumn(startingRowId, bucket));
} else {
columnAdaptations.add(ColumnAdaptation.rowIdColumn());
}
} else if (transaction.isUpdate()) {
HiveUpdateProcessor updateProcessor = transaction.getUpdateProcessor().orElseThrow(() -> new IllegalArgumentException("updateProcessor not present"));
List<HiveColumnHandle> dependencyColumns = projections.stream().filter(HiveColumnHandle::isBaseColumn).collect(toImmutableList());
if (originalFile) {
int bucket = bucketNumber.orElse(0);
long startingRowId = originalFileRowId.orElse(0L);
columnAdaptations.add(updatedRowColumnsWithOriginalFiles(startingRowId, bucket, updateProcessor, dependencyColumns));
} else {
columnAdaptations.add(updatedRowColumns(updateProcessor, dependencyColumns));
}
}
return new OrcPageSource(recordReader, columnAdaptations, orcDataSource, deletedRows, originalFileRowId, memoryUsage, stats);
} catch (Exception e) {
try {
orcDataSource.close();
} catch (IOException ignored) {
}
if (e instanceof TrinoException) {
throw (TrinoException) e;
}
String message = splitError(e, path, start, length);
if (e instanceof BlockMissingException) {
throw new TrinoException(HIVE_MISSING_DATA, message, e);
}
throw new TrinoException(HIVE_CANNOT_OPEN_SPLIT, message, e);
}
}
use of io.trino.spi.connector.EmptyPageSource in project trino by trinodb.
the class RcFilePageSourceFactory method createPageSource.
@Override
public Optional<ReaderPageSource> createPageSource(Configuration configuration, ConnectorSession session, Path path, long start, long length, long estimatedFileSize, Properties schema, List<HiveColumnHandle> columns, TupleDomain<HiveColumnHandle> effectivePredicate, Optional<AcidInfo> acidInfo, OptionalInt bucketNumber, boolean originalFile, AcidTransaction transaction) {
RcFileEncoding rcFileEncoding;
String deserializerClassName = getDeserializerClassName(schema);
if (deserializerClassName.equals(LazyBinaryColumnarSerDe.class.getName())) {
rcFileEncoding = new BinaryRcFileEncoding(timeZone);
} else if (deserializerClassName.equals(ColumnarSerDe.class.getName())) {
rcFileEncoding = createTextVectorEncoding(schema);
} else {
return Optional.empty();
}
checkArgument(acidInfo.isEmpty(), "Acid is not supported");
List<HiveColumnHandle> projectedReaderColumns = columns;
Optional<ReaderColumns> readerProjections = projectBaseColumns(columns);
if (readerProjections.isPresent()) {
projectedReaderColumns = readerProjections.get().get().stream().map(HiveColumnHandle.class::cast).collect(toImmutableList());
}
RcFileDataSource dataSource;
try {
FileSystem fileSystem = hdfsEnvironment.getFileSystem(session.getIdentity(), path, configuration);
FSDataInputStream inputStream = hdfsEnvironment.doAs(session.getIdentity(), () -> fileSystem.open(path));
if (estimatedFileSize < BUFFER_SIZE.toBytes()) {
// Handle potentially imprecise file lengths by reading the footer
try {
FSDataInputStreamTail fileTail = FSDataInputStreamTail.readTail(path.toString(), estimatedFileSize, inputStream, toIntExact(BUFFER_SIZE.toBytes()));
dataSource = new MemoryRcFileDataSource(new RcFileDataSourceId(path.toString()), fileTail.getTailSlice());
} finally {
inputStream.close();
}
} else {
long fileSize = hdfsEnvironment.doAs(session.getIdentity(), () -> fileSystem.getFileStatus(path).getLen());
dataSource = new HdfsRcFileDataSource(path.toString(), inputStream, fileSize, stats);
}
} catch (Exception e) {
if (nullToEmpty(e.getMessage()).trim().equals("Filesystem closed") || e instanceof FileNotFoundException) {
throw new TrinoException(HIVE_CANNOT_OPEN_SPLIT, e);
}
throw new TrinoException(HIVE_CANNOT_OPEN_SPLIT, splitError(e, path, start, length), e);
}
length = min(dataSource.getSize() - start, length);
// Split may be empty now that the correct file size is known
if (length <= 0) {
return Optional.of(noProjectionAdaptation(new EmptyPageSource()));
}
try {
ImmutableMap.Builder<Integer, Type> readColumns = ImmutableMap.builder();
HiveTimestampPrecision timestampPrecision = getTimestampPrecision(session);
for (HiveColumnHandle column : projectedReaderColumns) {
readColumns.put(column.getBaseHiveColumnIndex(), column.getHiveType().getType(typeManager, timestampPrecision));
}
RcFileReader rcFileReader = new RcFileReader(dataSource, rcFileEncoding, readColumns.buildOrThrow(), new AircompressorCodecFactory(new HadoopCodecFactory(configuration.getClassLoader())), start, length, BUFFER_SIZE);
ConnectorPageSource pageSource = new RcFilePageSource(rcFileReader, projectedReaderColumns);
return Optional.of(new ReaderPageSource(pageSource, readerProjections));
} catch (Throwable e) {
try {
dataSource.close();
} catch (IOException ignored) {
}
if (e instanceof TrinoException) {
throw (TrinoException) e;
}
String message = splitError(e, path, start, length);
if (e instanceof RcFileCorruptionException) {
throw new TrinoException(HIVE_BAD_DATA, message, e);
}
if (e instanceof BlockMissingException) {
throw new TrinoException(HIVE_MISSING_DATA, message, e);
}
throw new TrinoException(HIVE_CANNOT_OPEN_SPLIT, message, e);
}
}
Aggregations