Use of io.prestosql.spi.dynamicfilter.DynamicFilter in project hetu-core by openlookeng.
The class HivePageSourceProvider, method createPageSourceInternal.
private ConnectorPageSource createPageSourceInternal(ConnectorSession session, Optional<DynamicFilterSupplier> dynamicFilterSupplier, List<Map<ColumnHandle, DynamicFilter>> dynamicFilters, HiveTableHandle hiveTable, List<HiveColumnHandle> hiveColumns, HiveSplit hiveSplit) {
Path path = new Path(hiveSplit.getPath());
List<Set<DynamicFilter>> dynamicFilterList = new ArrayList<>();
if (dynamicFilters != null) {
for (Map<ColumnHandle, DynamicFilter> df : dynamicFilters) {
Set<DynamicFilter> values = df.values().stream().collect(Collectors.toSet());
dynamicFilterList.add(values);
}
}
// Filter out splits using partition values and dynamic filters
if (dynamicFilters != null && !dynamicFilters.isEmpty() && isPartitionFiltered(hiveSplit.getPartitionKeys(), dynamicFilterList, typeManager)) {
return new FixedPageSource(ImmutableList.of());
}
Configuration configuration = hdfsEnvironment.getConfiguration(new HdfsEnvironment.HdfsContext(session, hiveSplit.getDatabase(), hiveSplit.getTable()), path);
Properties schema = hiveSplit.getSchema();
String columnNameDelimiter = schema.containsKey(serdeConstants.COLUMN_NAME_DELIMITER) ? schema.getProperty(serdeConstants.COLUMN_NAME_DELIMITER) : String.valueOf(SerDeUtils.COMMA);
List<String> partitionColumnNames;
if (schema.containsKey(META_PARTITION_COLUMNS)) {
partitionColumnNames = Arrays.asList(schema.getProperty(META_PARTITION_COLUMNS).split(columnNameDelimiter));
} else if (schema.containsKey(META_TABLE_COLUMNS)) {
partitionColumnNames = Arrays.asList(schema.getProperty(META_TABLE_COLUMNS).split(columnNameDelimiter));
} else {
partitionColumnNames = new ArrayList<>();
}
List<String> tableColumns = hiveColumns.stream().map(cols -> cols.getName()).collect(toList());
List<String> missingColumns = tableColumns.stream().skip(partitionColumnNames.size()).collect(toList());
List<IndexMetadata> indexes = new ArrayList<>();
if (indexCache != null && session.isHeuristicIndexFilterEnabled()) {
indexes.addAll(this.indexCache.getIndices(session.getCatalog().orElse(null), hiveTable.getSchemaTableName().toString(), hiveSplit, hiveTable.getCompactEffectivePredicate(), hiveTable.getPartitionColumns()));
/* Bloom/Bitmap indices are checked for the given table and added to the possible matchers for pushdown. */
if (hiveTable.getDisjunctCompactEffectivePredicate().isPresent() && hiveTable.getDisjunctCompactEffectivePredicate().get().size() > 0) {
hiveTable.getDisjunctCompactEffectivePredicate().get().forEach(orPredicate -> indexes.addAll(this.indexCache.getIndices(session.getCatalog().orElse(null), hiveTable.getSchemaTableName().toString(), hiveSplit, orPredicate, hiveTable.getPartitionColumns())));
}
}
Optional<List<IndexMetadata>> indexOptional = indexes == null || indexes.isEmpty() ? Optional.empty() : Optional.of(indexes);
URI splitUri = URI.create(URIUtil.encodePath(hiveSplit.getPath()));
SplitMetadata splitMetadata = new SplitMetadata(splitUri.getRawPath(), hiveSplit.getLastModifiedTime());
TupleDomain<HiveColumnHandle> predicate = TupleDomain.all();
if (dynamicFilterSupplier.isPresent() && dynamicFilters != null && !dynamicFilters.isEmpty()) {
if (dynamicFilters.size() == 1) {
List<HiveColumnHandle> filteredHiveColumnHandles = hiveColumns.stream().filter(column -> dynamicFilters.get(0).containsKey(column)).collect(toList());
HiveColumnHandle hiveColumnHandle = filteredHiveColumnHandles.get(0);
Type type = hiveColumnHandle.getColumnMetadata(typeManager).getType();
predicate = getPredicate(dynamicFilters.get(0).get(hiveColumnHandle), type, hiveColumnHandle);
if (predicate.isNone()) {
predicate = TupleDomain.all();
}
}
}
/**
* This is the main logical division point for the filter-pushdown-enabled case (also known as the selective read flow).
* If the user configuration orc_predicate_pushdown_enabled is true and every clause of the query can be handled by the Hive
* selective read flow, then hiveTable.isSuitableToPush() will be enabled.
* (Refer to HiveMetadata.checkIfSuitableToPush.)
*/
if (hiveTable.isSuitableToPush()) {
return createSelectivePageSource(selectivePageSourceFactories, configuration, session, hiveSplit, assignUniqueIndicesToPartitionColumns(hiveColumns), typeManager, dynamicFilterSupplier, hiveSplit.getDeleteDeltaLocations(), hiveSplit.getStartRowOffsetOfFile(), indexOptional, hiveSplit.isCacheable(), hiveTable.getCompactEffectivePredicate(), hiveTable.getPredicateColumns(), hiveTable.getDisjunctCompactEffectivePredicate(), hiveSplit.getBucketConversion(), hiveSplit.getBucketNumber(), hiveSplit.getLastModifiedTime(), missingColumns);
}
Optional<ConnectorPageSource> pageSource = createHivePageSource(cursorProviders, pageSourceFactories, configuration, session, path, hiveSplit.getBucketNumber(), hiveSplit.getStart(), hiveSplit.getLength(), hiveSplit.getFileSize(), hiveSplit.getSchema(), hiveTable.getCompactEffectivePredicate().intersect(predicate), hiveColumns, hiveSplit.getPartitionKeys(), typeManager, hiveSplit.getColumnCoercions(), hiveSplit.getBucketConversion(), hiveSplit.isS3SelectPushdownEnabled(), dynamicFilterSupplier, hiveSplit.getDeleteDeltaLocations(), hiveSplit.getStartRowOffsetOfFile(), indexOptional, splitMetadata, hiveSplit.isCacheable(), hiveSplit.getLastModifiedTime(), hiveSplit.getCustomSplitInfo(), missingColumns);
if (pageSource.isPresent()) {
return pageSource.get();
}
throw new RuntimeException("Could not find a file reader for split " + hiveSplit);
}
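For readers of this page, a minimal sketch of how the dynamicFilters argument consumed above could be assembled. It reuses only the constructors that appear in TestHivePageSource at the bottom of this page; the column, values, and filter id are illustrative assumptions, and treating the outer list as one map per OR'ed disjunct is an inference from how isPartitionFiltered and the test use it, not a statement of the engine's actual wiring.
// Hypothetical assembly of the dynamicFilters parameter; not taken from hetu-core itself.
// Assumes HiveColumnHandle, BloomFilter and BloomFilterDynamicFilter as used in TestHivePageSource below.
ColumnHandle dayColumn = new HiveColumnHandle("pt_d", HIVE_INT, parseTypeSignature(INTEGER), 0, REGULAR, Optional.empty());
BloomFilter dayFilter = new BloomFilter(1024 * 1024, 0.01);
for (int day = 0; day < 10; day++) {
    dayFilter.add(day); // values gathered from the build side of the join (illustrative)
}
Map<ColumnHandle, DynamicFilter> conjunct = new HashMap<>();
conjunct.put(dayColumn, new BloomFilterDynamicFilter("1", dayColumn, dayFilter, DynamicFilter.Type.GLOBAL));
// Each map holds filters that apply together; multiple maps would represent OR'ed alternatives.
List<Map<ColumnHandle, DynamicFilter>> dynamicFilters = ImmutableList.of(conjunct);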
Use of io.prestosql.spi.dynamicfilter.DynamicFilter in project hetu-core by openlookeng.
The class HiveSplitManager, method getSplits.
@Override
public ConnectorSplitSource getSplits(ConnectorTransactionHandle transaction, ConnectorSession session, ConnectorTableHandle tableHandle, SplitSchedulingStrategy splitSchedulingStrategy, Supplier<List<Set<DynamicFilter>>> dynamicFilterSupplier, Optional<QueryType> queryType, Map<String, Object> queryInfo, Set<TupleDomain<ColumnMetadata>> userDefinedCachePredicates, boolean partOfReuse) {
HiveTableHandle hiveTable = (HiveTableHandle) tableHandle;
SchemaTableName tableName = hiveTable.getSchemaTableName();
// get table metadata
SemiTransactionalHiveMetastore metastore = metastoreProvider.apply((HiveTransactionHandle) transaction);
Table table = metastore.getTable(new HiveIdentity(session), tableName.getSchemaName(), tableName.getTableName()).orElseThrow(() -> new TableNotFoundException(tableName));
if (table.getStorage().getStorageFormat().getInputFormat().contains("carbon")) {
throw new PrestoException(NOT_SUPPORTED, "Hive connector can't read carbondata tables");
}
// verify table is not marked as non-readable
String tableNotReadable = table.getParameters().get(OBJECT_NOT_READABLE);
if (!isNullOrEmpty(tableNotReadable)) {
throw new HiveNotReadableException(tableName, Optional.empty(), tableNotReadable);
}
// get partitions
List<HivePartition> partitions = partitionManager.getOrLoadPartitions(session, metastore, new HiveIdentity(session), hiveTable);
// short circuit if we don't have any partitions
if (partitions.isEmpty()) {
return new FixedSplitSource(ImmutableList.of());
}
// get buckets from first partition (arbitrary)
Optional<HiveBucketing.HiveBucketFilter> bucketFilter = hiveTable.getBucketFilter();
// validate bucketed execution
Optional<HiveBucketHandle> bucketHandle = hiveTable.getBucketHandle();
if ((splitSchedulingStrategy == GROUPED_SCHEDULING) && !bucketHandle.isPresent()) {
throw new PrestoException(GENERIC_INTERNAL_ERROR, "SchedulingPolicy is bucketed, but BucketHandle is not present");
}
// sort partitions
partitions = Ordering.natural().onResultOf(HivePartition::getPartitionId).reverse().sortedCopy(partitions);
Iterable<HivePartitionMetadata> hivePartitions = getPartitionMetadata(session, metastore, table, tableName, partitions, bucketHandle.map(HiveBucketHandle::toTableBucketProperty));
HiveSplitLoader hiveSplitLoader = new BackgroundHiveSplitLoader(table, hivePartitions, hiveTable.getCompactEffectivePredicate(), BackgroundHiveSplitLoader.BucketSplitInfo.createBucketSplitInfo(bucketHandle, bucketFilter), session, hdfsEnvironment, namenodeStats, directoryLister, executor, splitLoaderConcurrency, recursiveDfsWalkerEnabled, metastore.getValidWriteIds(session, hiveTable, queryType.map(t -> t == QueryType.VACUUM).orElse(false)).map(validTxnWriteIdList -> validTxnWriteIdList.getTableValidWriteIdList(table.getDatabaseName() + "." + table.getTableName())), dynamicFilterSupplier, queryType, queryInfo, typeManager);
HiveSplitSource splitSource;
HiveStorageFormat hiveStorageFormat = HiveMetadata.extractHiveStorageFormat(table);
switch(splitSchedulingStrategy) {
case UNGROUPED_SCHEDULING:
splitSource = HiveSplitSource.allAtOnce(session, table.getDatabaseName(), table.getTableName(), // For reuse, we should make sure to have the same split size at all times for a table.
partOfReuse ? 0 : maxInitialSplits, maxOutstandingSplits, maxOutstandingSplitsSize, maxSplitsPerSecond, hiveSplitLoader, executor, new CounterStat(), dynamicFilterSupplier, userDefinedCachePredicates, typeManager, hiveConfig, hiveStorageFormat);
break;
case GROUPED_SCHEDULING:
splitSource = HiveSplitSource.bucketed(session, table.getDatabaseName(), table.getTableName(), // For reuse, we should make sure to have the same split size at all times for a table.
partOfReuse ? 0 : maxInitialSplits, maxOutstandingSplits, maxOutstandingSplitsSize, maxSplitsPerSecond, hiveSplitLoader, executor, new CounterStat(), dynamicFilterSupplier, userDefinedCachePredicates, typeManager, hiveConfig, hiveStorageFormat);
break;
default:
throw new IllegalArgumentException("Unknown splitSchedulingStrategy: " + splitSchedulingStrategy);
}
hiveSplitLoader.start(splitSource);
if (queryType.isPresent() && queryType.get() == QueryType.VACUUM) {
HdfsContext hdfsContext = new HdfsContext(session, table.getDatabaseName(), table.getTableName());
return new HiveVacuumSplitSource(splitSource, (HiveVacuumTableHandle) queryInfo.get("vacuumHandle"), hdfsEnvironment, hdfsContext, session);
}
return splitSource;
}
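A minimal sketch of how the dynamicFilterSupplier parameter above could be backed by filters collected elsewhere; the collectedFilters field and the helper method are hypothetical, and only the Set/List shaping mirrors the conversion done in createPageSourceInternal earlier on this page.
// Hypothetical: adapt per-column filter maps into the Supplier<List<Set<DynamicFilter>>> that getSplits expects.
// 'collectedFilters' is an illustrative field, not part of HiveSplitManager.
private final List<Map<ColumnHandle, DynamicFilter>> collectedFilters = new CopyOnWriteArrayList<>();

private Supplier<List<Set<DynamicFilter>>> dynamicFilterSupplier()
{
    return () -> collectedFilters.stream()
            .map(df -> df.values().stream().collect(Collectors.toSet()))
            .collect(toList());
}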
Use of io.prestosql.spi.dynamicfilter.DynamicFilter in project hetu-core by openlookeng.
The class HivePageSourceProvider, method getPredicate.
private static TupleDomain<HiveColumnHandle> getPredicate(DynamicFilter dynamicFilter, Type type, HiveColumnHandle hiveColumnHandle) {
if (dynamicFilter instanceof CombinedDynamicFilter) {
List<DynamicFilter> filters = ((CombinedDynamicFilter) dynamicFilter).getFilters();
List<TupleDomain<HiveColumnHandle>> predicates = filters.stream().map(filter -> getPredicate(filter, type, hiveColumnHandle)).collect(toList());
return predicates.stream().reduce(TupleDomain.all(), TupleDomain::intersect);
}
if (dynamicFilter instanceof FilteredDynamicFilter && !((FilteredDynamicFilter) dynamicFilter).getSetValues().isEmpty()) {
Domain domain = Domain.create(ValueSet.copyOf(type, ((FilteredDynamicFilter) dynamicFilter).getSetValues()), false);
domain = modifyDomain(domain, ((FilteredDynamicFilter) dynamicFilter).getFilterExpression());
return TupleDomain.withColumnDomains(ImmutableMap.of(hiveColumnHandle, domain));
}
return TupleDomain.all();
}
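For illustration, a small sketch of the intersection semantics the CombinedDynamicFilter branch above relies on: each filter contributes a value-set domain, and reducing with TupleDomain.all() as the identity keeps only values allowed by every filter. The method and values are hypothetical; it assumes a column whose Java representation is long (for example BigintType.BIGINT).
// Sketch only (not project code): two value-set restrictions on the same column, intersected
// the same way the predicates list is reduced above.
private static TupleDomain<HiveColumnHandle> intersectExample(HiveColumnHandle column, Type type)
{
    Domain first = Domain.create(ValueSet.copyOf(type, ImmutableList.of(1L, 2L, 3L)), false);
    Domain second = Domain.create(ValueSet.copyOf(type, ImmutableList.of(2L, 3L, 4L)), false);
    // The result constrains 'column' to the overlap {2, 3}; TupleDomain.all() is the identity of this reduction.
    return TupleDomain.withColumnDomains(ImmutableMap.of(column, first))
            .intersect(TupleDomain.withColumnDomains(ImmutableMap.of(column, second)));
}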
Use of io.prestosql.spi.dynamicfilter.DynamicFilter in project hetu-core by openlookeng.
The class DynamicFilterService, method mergeDynamicFilters.
/**
* Global dynamic filter merging: periodically looks for dynamic filters that can be merged and merges them.
*/
private void mergeDynamicFilters() {
final StateStore stateStore = stateStoreProvider.getStateStore();
for (Map.Entry<String, Map<String, DynamicFilterRegistryInfo>> queryToDynamicFiltersEntry : dynamicFilters.entrySet()) {
final String queryId = queryToDynamicFiltersEntry.getKey();
if (!cachedDynamicFilters.containsKey(queryId)) {
cachedDynamicFilters.put(queryId, new ConcurrentHashMap<>());
}
Map<String, DynamicFilter> cachedDynamicFiltersForQuery = cachedDynamicFilters.get(queryId);
StateMap mergedDynamicFilters = (StateMap) stateStore.getOrCreateStateCollection(DynamicFilterUtils.MERGED_DYNAMIC_FILTERS, MAP);
for (Map.Entry<String, DynamicFilterRegistryInfo> columnToDynamicFilterEntry : queryToDynamicFiltersEntry.getValue().entrySet()) {
if (columnToDynamicFilterEntry.getValue().isMerged()) {
continue;
}
final String filterId = columnToDynamicFilterEntry.getKey();
final Type filterType = columnToDynamicFilterEntry.getValue().getType();
final DataType filterDataType = columnToDynamicFilterEntry.getValue().getDataType();
final Optional<Predicate<List>> dfFilter = columnToDynamicFilterEntry.getValue().getFilter();
final Symbol column = columnToDynamicFilterEntry.getValue().getSymbol();
final String filterKey = createKey(DynamicFilterUtils.FILTERPREFIX, filterId, queryId);
if (!hasMergeCondition(filterId, queryId)) {
continue;
}
Collection<Object> results = ((StateSet) stateStore.getStateCollection(createKey(DynamicFilterUtils.PARTIALPREFIX, filterId, queryId))).getAll();
try {
DynamicFilter mergedFilter;
if (filterDataType == BLOOM_FILTER) {
BloomFilter mergedBloomFilter = mergeBloomFilters(results);
if (mergedBloomFilter.expectedFpp() > DynamicFilterUtils.BLOOM_FILTER_EXPECTED_FPP) {
throw new PrestoException(GENERIC_INTERNAL_ERROR, "FPP too high: " + mergedBloomFilter.approximateElementCount());
}
mergedFilter = new BloomFilterDynamicFilter(filterKey, null, mergedBloomFilter, filterType);
if (filterType == GLOBAL) {
try (ByteArrayOutputStream out = new ByteArrayOutputStream()) {
mergedBloomFilter.writeTo(out);
byte[] filter = out.toByteArray();
mergedDynamicFilters.put(filterKey, filter);
}
}
} else if (filterDataType == HASHSET) {
Set mergedSet = mergeHashSets(results);
mergedFilter = DynamicFilterFactory.create(filterKey, null, mergedSet, filterType, dfFilter, Optional.empty());
if (filterType == GLOBAL) {
mergedDynamicFilters.put(filterKey, mergedSet);
}
} else {
throw new PrestoException(GENERIC_INTERNAL_ERROR, "Unsupported filter data type: " + filterDataType);
}
log.debug("Merged successfully dynamic filter id: " + filterId + "-" + queryId + " type: " + filterDataType + ", column: " + column + ", item count: " + mergedFilter.getSize());
cachedDynamicFiltersForQuery.put(filterId, mergedFilter);
} catch (IOException | PrestoException e) {
log.warn("Could not merge dynamic filter: " + e.getLocalizedMessage());
} finally {
// for each dynamic filter we only try to merge it once
columnToDynamicFilterEntry.getValue().setMerged();
}
}
}
}
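As a usage note, a minimal sketch of reading back a filter merged by the loop above; the lookup method is hypothetical and only assumes the cachedDynamicFilters shape already used here (query id mapped to filter id mapped to DynamicFilter).
// Hypothetical lookup against the cache populated by mergeDynamicFilters; empty until a merge has happened.
private Optional<DynamicFilter> getMergedFilter(String queryId, String filterId)
{
    Map<String, DynamicFilter> filtersForQuery = cachedDynamicFilters.get(queryId);
    if (filtersForQuery == null) {
        return Optional.empty();
    }
    return Optional.ofNullable(filtersForQuery.get(filterId));
}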
Use of io.prestosql.spi.dynamicfilter.DynamicFilter in project hetu-core by openlookeng.
The class TestHivePageSource, method testFilterRows.
@Test(dataProvider = "data")
public void testFilterRows(int columnOffset1, int columnOffset2, int expectedPositionCount, String message) {
final Type[] types = new Type[] { BigintType.BIGINT, BigintType.BIGINT };
final int numValues = 1024;
BlockBuilder builder = new LongArrayBlockBuilder(null, numValues);
for (int i = 0; i < numValues; i++) {
builder.writeLong(i);
}
Block dayBlock = builder.build();
builder = new LongArrayBlockBuilder(null, numValues);
for (int i = 0; i < numValues; i++) {
builder.writeLong(10000 + i);
}
Block appBlock = builder.build();
Page page = new Page(dayBlock, appBlock);
Map<ColumnHandle, DynamicFilter> dynamicFilter = new HashMap<>();
ColumnHandle dayColumn = new HiveColumnHandle("pt_d", HIVE_INT, parseTypeSignature(INTEGER), 0, REGULAR, Optional.empty());
ColumnHandle appColumn = new HiveColumnHandle("app_d", HIVE_INT, parseTypeSignature(INTEGER), 1, REGULAR, Optional.empty());
BloomFilter dayFilter = new BloomFilter(1024 * 1024, 0.01);
BloomFilter appFilter = new BloomFilter(1024 * 1024, 0.01);
for (int i = 0; i < 10; i++) {
dayFilter.add(columnOffset1 + i);
appFilter.add(columnOffset2 + i);
}
dynamicFilter.put(dayColumn, new BloomFilterDynamicFilter("1", dayColumn, dayFilter, DynamicFilter.Type.GLOBAL));
dynamicFilter.put(appColumn, new BloomFilterDynamicFilter("2", appColumn, appFilter, DynamicFilter.Type.GLOBAL));
List<Map<ColumnHandle, DynamicFilter>> dynamicFilters = new ArrayList<>();
dynamicFilters.add(dynamicFilter);
List<Map<Integer, ColumnHandle>> eligibleColumns = ImmutableList.of(ImmutableMap.of(0, dayColumn, 1, appColumn));
Page filteredPage = filter(dynamicFilters, page, eligibleColumns, types);
assertEquals(filteredPage.getPositionCount(), expectedPositionCount, message);
}
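The @DataProvider named "data" is not included on this page; below is a hypothetical provider of the matching shape (columnOffset1, columnOffset2, expectedPositionCount, message). The expected counts assume the filters inside one map are applied conjunctively, so a row survives only if both its pt_d and app_d values hit their bloom filters; the numbers are illustrative, not taken from the project.
// Hypothetical data provider for testFilterRows; values are illustrative.
// pt_d holds 0..1023 and app_d holds 10000..11023; each filter covers ten consecutive values from its offset.
@DataProvider(name = "data")
public Object[][] data()
{
    return new Object[][] {
            {0, 10000, 10, "aligned filters keep all ten matching rows"},
            {0, 10005, 5, "partially overlapping filters keep the five-row overlap"},
            {0, 10100, 0, "disjoint filters keep no rows"},
    };
}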