Use of io.prestosql.spi.heuristicindex.SplitMetadata in project hetu-core by openlookeng.
The class OrcRecordReader, method nextPage().
public Page nextPage() throws IOException {
    ColumnReader[] columnsReader = getColumnReaders();
    int batchSize = prepareNextBatch();
    if (batchSize < 0) {
        return null;
    }
    for (ColumnReader column : columnsReader) {
        if (column != null) {
            column.prepareNextRead(batchSize);
        }
    }
    batchRead(batchSize);
    matchingRowsInBatchArray = null;
    validateWritePageChecksum(batchSize);
    // create a lazy page
    blockFactory.nextPage();
    Arrays.fill(currentBytesPerCell, 0);
    Block[] blocks = new Block[columnsReader.length];
    for (int i = 0; i < columnsReader.length; i++) {
        int columnIndex = i;
        blocks[columnIndex] = blockFactory.createBlock(batchSize,
                () -> filterRows(columnsReader[columnIndex].readBlock()),
                block -> blockLoaded(columnIndex, block));
    }
    // only include page metadata if enabled
    if (pageMetadataEnabled) {
        Properties pageMetadata = new Properties();
        pageCount++;
        pageMetadata.setProperty(DATASOURCE_PAGE_NUMBER, String.valueOf(pageCount));
        if (isCurrentStripeFinished()) {
            // Only set the total page count when the current stripe has finished.
            // Therefore, whenever this property is present in pageMetadata,
            // the stripe has finished and this is its last page.
            pageMetadata.setProperty(DATASOURCE_TOTAL_PAGES, String.valueOf(pageCount));
            pageCount = 0;
        }
        pageMetadata.setProperty(DATASOURCE_STRIPE_NUMBER, String.valueOf(currentStripe));
        pageMetadata.setProperty(DATASOURCE_STRIPE_OFFSET, String.valueOf(stripes.get(currentStripe).getOffset()));
        pageMetadata.setProperty(DATASOURCE_STRIPE_LENGTH, String.valueOf(stripes.get(currentStripe).getTotalLength()));
        if (splitMetadata != null) {
            // splitMetadata is null in some tests; skip the split identity properties in that case
            pageMetadata.setProperty(DATASOURCE_FILE_PATH, splitMetadata.getSplitIdentity());
            pageMetadata.setProperty(DATASOURCE_FILE_MODIFICATION, String.valueOf(splitMetadata.getLastModifiedTime()));
        }
        pageMetadata.setProperty(DATASOURCE_INDEX_LEVEL, "STRIPE");
        return new Page(batchSize, pageMetadata, blocks);
    } else {
        return new Page(batchSize, blocks);
    }
}
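The page-level metadata attached above is only useful to a consumer that understands the stripe bookkeeping: DATASOURCE_TOTAL_PAGES is set exactly once per stripe, on its last page. The following is a minimal, self-contained sketch of such a consumer, assuming hypothetical key strings (the real keys are the DATASOURCE_* constants used in nextPage(), whose values are not shown here); it only illustrates that contract, not hetu-core's actual consumer code.

import java.util.Properties;

public class PageMetadataConsumerSketch {
    // Hypothetical key names; the real values come from the DATASOURCE_* constants in nextPage()
    private static final String PAGE_NUMBER_KEY = "pageNumber";
    private static final String STRIPE_NUMBER_KEY = "stripeNumber";
    private static final String TOTAL_PAGES_KEY = "totalPages";

    // True when the metadata marks the last page of its stripe, i.e. when nextPage()
    // set the total-page count because isCurrentStripeFinished() returned true
    static boolean isLastPageOfStripe(Properties pageMetadata) {
        return pageMetadata.getProperty(TOTAL_PAGES_KEY) != null;
    }

    public static void main(String[] args) {
        Properties pageMetadata = new Properties();
        pageMetadata.setProperty(PAGE_NUMBER_KEY, "3");
        pageMetadata.setProperty(STRIPE_NUMBER_KEY, "0");
        pageMetadata.setProperty(TOTAL_PAGES_KEY, "3"); // only present on the stripe's final page
        System.out.println("last page of stripe: " + isLastPageOfStripe(pageMetadata));
    }
}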
Use of io.prestosql.spi.heuristicindex.SplitMetadata in project hetu-core by openlookeng.
The class HivePageSourceProvider, method createPageSourceInternal().
private ConnectorPageSource createPageSourceInternal(ConnectorSession session,
        Optional<DynamicFilterSupplier> dynamicFilterSupplier,
        List<Map<ColumnHandle, DynamicFilter>> dynamicFilters,
        HiveTableHandle hiveTable,
        List<HiveColumnHandle> hiveColumns,
        HiveSplit hiveSplit) {
    Path path = new Path(hiveSplit.getPath());
    List<Set<DynamicFilter>> dynamicFilterList = new ArrayList<>();
    if (dynamicFilters != null) {
        for (Map<ColumnHandle, DynamicFilter> df : dynamicFilters) {
            Set<DynamicFilter> values = df.values().stream().collect(Collectors.toSet());
            dynamicFilterList.add(values);
        }
    }
    // Filter out splits using partition values and dynamic filters
    if (dynamicFilters != null && !dynamicFilters.isEmpty() && isPartitionFiltered(hiveSplit.getPartitionKeys(), dynamicFilterList, typeManager)) {
        return new FixedPageSource(ImmutableList.of());
    }
    Configuration configuration = hdfsEnvironment.getConfiguration(new HdfsEnvironment.HdfsContext(session, hiveSplit.getDatabase(), hiveSplit.getTable()), path);
    Properties schema = hiveSplit.getSchema();
    String columnNameDelimiter = schema.containsKey(serdeConstants.COLUMN_NAME_DELIMITER)
            ? schema.getProperty(serdeConstants.COLUMN_NAME_DELIMITER)
            : String.valueOf(SerDeUtils.COMMA);
    List<String> partitionColumnNames;
    if (schema.containsKey(META_PARTITION_COLUMNS)) {
        partitionColumnNames = Arrays.asList(schema.getProperty(META_PARTITION_COLUMNS).split(columnNameDelimiter));
    } else if (schema.containsKey(META_TABLE_COLUMNS)) {
        partitionColumnNames = Arrays.asList(schema.getProperty(META_TABLE_COLUMNS).split(columnNameDelimiter));
    } else {
        partitionColumnNames = new ArrayList<>();
    }
    List<String> tableColumns = hiveColumns.stream().map(cols -> cols.getName()).collect(toList());
    List<String> missingColumns = tableColumns.stream().skip(partitionColumnNames.size()).collect(toList());
    List<IndexMetadata> indexes = new ArrayList<>();
    if (indexCache != null && session.isHeuristicIndexFilterEnabled()) {
        indexes.addAll(this.indexCache.getIndices(session.getCatalog().orElse(null),
                hiveTable.getSchemaTableName().toString(), hiveSplit,
                hiveTable.getCompactEffectivePredicate(), hiveTable.getPartitionColumns()));
        /* Bloom/Bitmap indices are checked for the given table and added to the possible matchers for pushdown. */
        if (hiveTable.getDisjunctCompactEffectivePredicate().isPresent() && hiveTable.getDisjunctCompactEffectivePredicate().get().size() > 0) {
            hiveTable.getDisjunctCompactEffectivePredicate().get().forEach(orPredicate ->
                    indexes.addAll(this.indexCache.getIndices(session.getCatalog().orElse(null),
                            hiveTable.getSchemaTableName().toString(), hiveSplit, orPredicate,
                            hiveTable.getPartitionColumns())));
        }
    }
    Optional<List<IndexMetadata>> indexOptional = indexes == null || indexes.isEmpty() ? Optional.empty() : Optional.of(indexes);
    URI splitUri = URI.create(URIUtil.encodePath(hiveSplit.getPath()));
    SplitMetadata splitMetadata = new SplitMetadata(splitUri.getRawPath(), hiveSplit.getLastModifiedTime());
    TupleDomain<HiveColumnHandle> predicate = TupleDomain.all();
    if (dynamicFilterSupplier.isPresent() && dynamicFilters != null && !dynamicFilters.isEmpty()) {
        if (dynamicFilters.size() == 1) {
            List<HiveColumnHandle> filteredHiveColumnHandles = hiveColumns.stream().filter(column -> dynamicFilters.get(0).containsKey(column)).collect(toList());
            HiveColumnHandle hiveColumnHandle = filteredHiveColumnHandles.get(0);
            Type type = hiveColumnHandle.getColumnMetadata(typeManager).getType();
            predicate = getPredicate(dynamicFilters.get(0).get(hiveColumnHandle), type, hiveColumnHandle);
            if (predicate.isNone()) {
                predicate = TupleDomain.all();
            }
        }
    }
    /**
     * This is the main division point between the normal read flow and the filter-pushdown-enabled
     * case (a.k.a. the selective read flow). If the user configuration orc_predicate_pushdown_enabled
     * is true and every clause of the query can be handled by the Hive selective read flow,
     * then hiveTable.isSuitableToPush() returns true (refer to HiveMetadata.checkIfSuitableToPush).
     */
    if (hiveTable.isSuitableToPush()) {
        return createSelectivePageSource(selectivePageSourceFactories, configuration, session, hiveSplit,
                assignUniqueIndicesToPartitionColumns(hiveColumns), typeManager, dynamicFilterSupplier,
                hiveSplit.getDeleteDeltaLocations(), hiveSplit.getStartRowOffsetOfFile(), indexOptional,
                hiveSplit.isCacheable(), hiveTable.getCompactEffectivePredicate(), hiveTable.getPredicateColumns(),
                hiveTable.getDisjunctCompactEffectivePredicate(), hiveSplit.getBucketConversion(),
                hiveSplit.getBucketNumber(), hiveSplit.getLastModifiedTime(), missingColumns);
    }
    Optional<ConnectorPageSource> pageSource = createHivePageSource(cursorProviders, pageSourceFactories,
            configuration, session, path, hiveSplit.getBucketNumber(), hiveSplit.getStart(), hiveSplit.getLength(),
            hiveSplit.getFileSize(), hiveSplit.getSchema(), hiveTable.getCompactEffectivePredicate().intersect(predicate),
            hiveColumns, hiveSplit.getPartitionKeys(), typeManager, hiveSplit.getColumnCoercions(),
            hiveSplit.getBucketConversion(), hiveSplit.isS3SelectPushdownEnabled(), dynamicFilterSupplier,
            hiveSplit.getDeleteDeltaLocations(), hiveSplit.getStartRowOffsetOfFile(), indexOptional, splitMetadata,
            hiveSplit.isCacheable(), hiveSplit.getLastModifiedTime(), hiveSplit.getCustomSplitInfo(), missingColumns);
    if (pageSource.isPresent()) {
        return pageSource.get();
    }
    throw new RuntimeException("Could not find a file reader for split " + hiveSplit);
}
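createPageSourceInternal derives the SplitMetadata identity by percent-encoding the split path and taking the raw URI path. The sketch below reproduces just that encoding step with plain JDK classes, as an illustration only: the multi-argument URI constructor stands in for Jetty's URIUtil.encodePath, the example path is made up, and in the real code the resulting string together with hiveSplit.getLastModifiedTime() is what gets passed to the SplitMetadata constructor.

import java.net.URI;
import java.net.URISyntaxException;

public class SplitIdentitySketch {
    public static void main(String[] args) throws URISyntaxException {
        // Hypothetical split path containing a character that needs escaping
        String splitPath = "/warehouse/sales db/part=2021-01/000000_0";

        // Percent-encode the path; this plays roughly the role of URIUtil.encodePath(...) above
        URI splitUri = new URI(null, null, splitPath, null);

        // getRawPath() keeps the encoded form; this string plus the split's
        // last-modified time is what the SplitMetadata is built from
        System.out.println(splitUri.getRawPath()); // /warehouse/sales%20db/part=2021-01/000000_0
    }
}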
Use of io.prestosql.spi.heuristicindex.SplitMetadata in project hetu-core by openlookeng.
The class OrcPageSourceFactory, method createOrcPageSource().
public static OrcPageSource createOrcPageSource(HdfsEnvironment hdfsEnvironment, String sessionUser,
        Configuration configuration, Path path, long start, long length, long fileSize,
        List<HiveColumnHandle> columns, boolean useOrcColumnNames, boolean isFullAcid,
        TupleDomain<HiveColumnHandle> effectivePredicate, DateTimeZone legacyFileTimeZone,
        TypeManager typeManager, DataSize maxMergeDistance, DataSize maxBufferSize, DataSize streamBufferSize,
        DataSize tinyStripeThreshold, DataSize maxReadBlockSize, boolean lazyReadSmallRanges,
        boolean orcBloomFiltersEnabled, FileFormatDataSourceStats stats, Optional<DynamicFilterSupplier> dynamicFilters,
        Optional<DeleteDeltaLocations> deleteDeltaLocations, Optional<Long> startRowOffsetOfFile,
        Optional<List<IndexMetadata>> indexes, SplitMetadata splitMetadata, OrcCacheStore orcCacheStore,
        OrcCacheProperties orcCacheProperties, int domainCompactionThreshold, boolean pageMetadataEnabled,
        long dataSourceLastModifiedTime) {
    for (HiveColumnHandle column : columns) {
        checkArgument(column.getColumnType() == HiveColumnHandle.ColumnType.REGULAR
                        || column.getHiveColumnIndex() == HiveColumnHandle.ROW_ID__COLUMN_INDEX,
                "column type must be regular: %s", column);
    }
    checkArgument(!effectivePredicate.isNone());
    OrcDataSource orcDataSource;
    try {
        // Always create a lazy stream; the HDFS stream is opened only when it is actually read.
        FSDataInputStream inputStream = new FSDataInputStream(new LazyFSInputStream(() -> {
            FileSystem fileSystem = hdfsEnvironment.getFileSystem(sessionUser, path, configuration);
            return hdfsEnvironment.doAs(sessionUser, () -> fileSystem.open(path));
        }));
        orcDataSource = new HdfsOrcDataSource(new OrcDataSourceId(path.toString()), fileSize, maxMergeDistance,
                maxBufferSize, streamBufferSize, lazyReadSmallRanges, inputStream, stats, dataSourceLastModifiedTime);
    } catch (Exception e) {
        if (nullToEmpty(e.getMessage()).trim().equals("Filesystem closed") || e instanceof FileNotFoundException) {
            throw new PrestoException(HIVE_CANNOT_OPEN_SPLIT, e);
        }
        throw new PrestoException(HIVE_CANNOT_OPEN_SPLIT, splitError(e, path, start, length), e);
    }
    AggregatedMemoryContext systemMemoryUsage = newSimpleAggregatedMemoryContext();
    try {
        OrcDataSource readerLocalDataSource = OrcReader.wrapWithCacheIfTiny(orcDataSource, tinyStripeThreshold);
        OrcFileTail fileTail;
        if (orcCacheProperties.isFileTailCacheEnabled()) {
            try {
                OrcDataSourceIdWithTimeStamp orcDataSourceIdWithTimeStamp = new OrcDataSourceIdWithTimeStamp(
                        readerLocalDataSource.getId(), readerLocalDataSource.getLastModifiedTime());
                fileTail = orcCacheStore.getFileTailCache().get(new OrcFileTailCacheKey(orcDataSourceIdWithTimeStamp),
                        () -> OrcPageSourceFactory.createFileTail(orcDataSource));
            } catch (UncheckedExecutionException | ExecutionException executionException) {
                handleCacheLoadException(executionException);
                log.debug(executionException.getCause(), "Error while caching the ORC file tail. Falling back to the default flow");
                fileTail = OrcPageSourceFactory.createFileTail(orcDataSource);
            }
        } else {
            fileTail = OrcPageSourceFactory.createFileTail(orcDataSource);
        }
        OrcReader reader = new OrcReader(readerLocalDataSource, fileTail, maxMergeDistance, tinyStripeThreshold, maxReadBlockSize);
        List<OrcColumn> fileColumns = reader.getRootColumn().getNestedColumns();
        List<OrcColumn> fileReadColumns = isFullAcid ? new ArrayList<>(columns.size() + 5) : new ArrayList<>(columns.size());
        List<Type> fileReadTypes = isFullAcid ? new ArrayList<>(columns.size() + 5) : new ArrayList<>(columns.size());
        ImmutableList<String> acidColumnNames = null;
        List<ColumnAdaptation> columnAdaptations = new ArrayList<>(columns.size());
        // Only Hive ACID files will begin with bucket_
        boolean fileNameContainsBucket = path.getName().contains("bucket");
        if (isFullAcid && fileNameContainsBucket) {
            // Skip the ACID schema check for non-ACID files
            acidColumnNames = ImmutableList.<String>builder()
                    .add(ACID_COLUMN_ORIGINAL_TRANSACTION, ACID_COLUMN_BUCKET, ACID_COLUMN_ROW_ID,
                            ACID_COLUMN_CURRENT_TRANSACTION, ACID_COLUMN_OPERATION)
                    .build();
            verifyAcidSchema(reader, path);
            Map<String, OrcColumn> acidColumnsByName = uniqueIndex(fileColumns, orcColumn -> orcColumn.getColumnName().toLowerCase(ENGLISH));
            if (AcidUtils.isDeleteDelta(path.getParent())) {
                // Avoid reading column data from delete_delta files.
                // Control reaches here during a minor VACUUM, where all delete_delta files are merged together.
                fileColumns = ImmutableList.of();
            } else {
                fileColumns = ensureColumnNameConsistency(acidColumnsByName.get(ACID_COLUMN_ROW_STRUCT).getNestedColumns(), columns);
            }
            fileReadColumns.add(acidColumnsByName.get(ACID_COLUMN_ORIGINAL_TRANSACTION.toLowerCase(ENGLISH)));
            fileReadTypes.add(BIGINT);
            fileReadColumns.add(acidColumnsByName.get(ACID_COLUMN_BUCKET.toLowerCase(ENGLISH)));
            fileReadTypes.add(INTEGER);
            fileReadColumns.add(acidColumnsByName.get(ACID_COLUMN_ROW_ID.toLowerCase(ENGLISH)));
            fileReadTypes.add(BIGINT);
            fileReadColumns.add(acidColumnsByName.get(ACID_COLUMN_CURRENT_TRANSACTION.toLowerCase(ENGLISH)));
            fileReadTypes.add(BIGINT);
            fileReadColumns.add(acidColumnsByName.get(ACID_COLUMN_OPERATION.toLowerCase(ENGLISH)));
            fileReadTypes.add(INTEGER);
        }
        Map<String, OrcColumn> fileColumnsByName = ImmutableMap.of();
        if (useOrcColumnNames || isFullAcid) {
            verifyFileHasColumnNames(fileColumns, path);
            // Convert column names read from ORC files to lower case to be consistent with those stored in the Hive Metastore
            fileColumnsByName = uniqueIndex(fileColumns, orcColumn -> orcColumn.getColumnName().toLowerCase(ENGLISH));
        }
        TupleDomainOrcPredicateBuilder predicateBuilder = TupleDomainOrcPredicate.builder().setBloomFiltersEnabled(orcBloomFiltersEnabled);
        Map<HiveColumnHandle, Domain> effectivePredicateDomains = effectivePredicate.getDomains()
                .orElseThrow(() -> new IllegalArgumentException("Effective predicate is none"));
        for (HiveColumnHandle column : columns) {
            OrcColumn orcColumn = null;
            if (useOrcColumnNames || isFullAcid) {
                orcColumn = fileColumnsByName.get(column.getName());
            } else if (column.getHiveColumnIndex() >= 0 && column.getHiveColumnIndex() < fileColumns.size()) {
                orcColumn = fileColumns.get(column.getHiveColumnIndex());
            }
            Type readType = typeManager.getType(column.getTypeSignature());
            if (orcColumn != null) {
                int sourceIndex = fileReadColumns.size();
                columnAdaptations.add(ColumnAdaptation.sourceColumn(sourceIndex));
                fileReadColumns.add(orcColumn);
                fileReadTypes.add(readType);
                Domain domain = effectivePredicateDomains.get(column);
                if (domain != null) {
                    predicateBuilder.addColumn(orcColumn.getColumnId(), domain);
                }
            } else if (isFullAcid && readType instanceof RowType && column.getName().equalsIgnoreCase(HiveColumnHandle.UPDATE_ROW_ID_COLUMN_NAME)) {
                HiveType hiveType = column.getHiveType();
                StructTypeInfo structTypeInfo = (StructTypeInfo) hiveType.getTypeInfo();
                ImmutableList.Builder<ColumnAdaptation> builder = new ImmutableList.Builder<>();
                ArrayList<String> fieldNames = structTypeInfo.getAllStructFieldNames();
                List<ColumnAdaptation> adaptations = fieldNames.stream()
                        .map(acidColumnNames::indexOf)
                        .map(c -> ColumnAdaptation.sourceColumn(c, false))
                        .collect(Collectors.toList());
                columnAdaptations.add(ColumnAdaptation.structColumn(structTypeInfo, adaptations));
            } else {
                columnAdaptations.add(ColumnAdaptation.nullColumn(readType));
            }
        }
        Map<String, Domain> domains = effectivePredicate.getDomains().get().entrySet().stream()
                .collect(toMap(e -> e.getKey().getName(), Map.Entry::getValue));
        OrcRecordReader recordReader = reader.createRecordReader(fileReadColumns, fileReadTypes, predicateBuilder.build(),
                start, length, legacyFileTimeZone, systemMemoryUsage, INITIAL_BATCH_SIZE,
                exception -> handleException(orcDataSource.getId(), exception), indexes, splitMetadata, domains,
                orcCacheStore, orcCacheProperties, pageMetadataEnabled);
        OrcDeletedRows deletedRows = new OrcDeletedRows(path.getName(), deleteDeltaLocations,
                new OrcDeleteDeltaPageSourceFactory(sessionUser, configuration, hdfsEnvironment, maxMergeDistance,
                        maxBufferSize, streamBufferSize, maxReadBlockSize, tinyStripeThreshold, lazyReadSmallRanges,
                        orcBloomFiltersEnabled, stats),
                sessionUser, configuration, hdfsEnvironment, startRowOffsetOfFile);
        boolean eagerload = false;
        if (indexes.isPresent()) {
            eagerload = indexes.get().stream().anyMatch(indexMetadata -> EAGER_LOAD_INDEX_ID.contains(indexMetadata.getIndex().getId()));
        }
        return new OrcPageSource(recordReader, columnAdaptations, orcDataSource, deletedRows, eagerload, systemMemoryUsage, stats);
    } catch (Exception e) {
        try {
            orcDataSource.close();
        } catch (IOException ignored) {
        }
        if (e instanceof PrestoException) {
            throw (PrestoException) e;
        }
        String message = splitError(e, path, start, length);
        if (e instanceof BlockMissingException) {
            throw new PrestoException(HIVE_MISSING_DATA, message, e);
        }
        throw new PrestoException(HIVE_CANNOT_OPEN_SPLIT, message, e);
    }
}
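createOrcPageSource deliberately wraps the HDFS input in a lazily opened stream so that no file is opened until the reader actually needs bytes. Below is a minimal, dependency-free sketch of that pattern; LazilyOpenedInputStream and the Opener interface are made-up names that only illustrate the idea behind LazyFSInputStream, not its actual implementation in hetu-core.

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;

public class LazyStreamSketch {
    // Opens the underlying stream on demand; mirrors the supplier passed to LazyFSInputStream
    interface Opener {
        InputStream open() throws IOException;
    }

    // Defers opening until the first read, so constructing the page source stays cheap
    static final class LazilyOpenedInputStream extends InputStream {
        private final Opener opener;
        private InputStream delegate;

        LazilyOpenedInputStream(Opener opener) {
            this.opener = opener;
        }

        private InputStream delegate() throws IOException {
            if (delegate == null) {
                delegate = opener.open(); // the expensive open happens here, not in the constructor
            }
            return delegate;
        }

        @Override
        public int read() throws IOException {
            return delegate().read();
        }

        @Override
        public void close() throws IOException {
            if (delegate != null) {
                delegate.close();
            }
        }
    }

    public static void main(String[] args) throws IOException {
        InputStream in = new LazilyOpenedInputStream(() -> {
            System.out.println("opening underlying stream");
            return new ByteArrayInputStream(new byte[] {42});
        });
        System.out.println("stream constructed, nothing opened yet");
        System.out.println("first byte: " + in.read()); // triggers the open
        in.close();
    }
}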