use of org.apache.drill.exec.work.filter.BloomFilter in project drill by apache.
the class RuntimeFilterRecordBatch method applyRuntimeFilter.
/**
 * If a RuntimeFilter is available, applies the filter condition on the
 * incoming batch records and creates an SV2 that stores the indexes of the
 * records that pass the filter condition. When no RuntimeFilter is
 * available, it simply passes all records from the incoming batch through
 * to the downstream operator.
 */
private void applyRuntimeFilter() {
  if (originalRecordCount <= 0) {
    sv2.setRecordCount(0);
    return;
  }
  current = context.getRuntimeFilter(rfIdentifier);
  timedWaiting();
  batchTimes++;
  sv2.allocateNew(originalRecordCount);
  if (current == null) {
    // means none of the rows are filtered out hence set all the indexes
    for (int i = 0; i < originalRecordCount; ++i) {
      sv2.setIndex(i, i);
    }
    sv2.setRecordCount(originalRecordCount);
    return;
  }
  // Setup a hash helper if needed
  setupHashHelper();
  // To make each independent bloom filter work together to construct a final filter result: BitSet.
  BitSet bitSet = new BitSet(originalRecordCount);
  int filterSize = toFilterFields.size();
  int svIndex = 0;
  if (filterSize == 1) {
    BloomFilter bloomFilter = bloomFilters.get(0);
    String fieldName = toFilterFields.get(0);
    int fieldId = field2id.get(fieldName);
    for (int rowIndex = 0; rowIndex < originalRecordCount; rowIndex++) {
      long hash;
      try {
        hash = hash64.hash64Code(rowIndex, 0, fieldId);
      } catch (SchemaChangeException e) {
        throw new UnsupportedOperationException(e);
      }
      boolean contain = bloomFilter.find(hash);
      if (contain) {
        sv2.setIndex(svIndex, rowIndex);
        svIndex++;
      } else {
        filteredRows++;
      }
    }
  } else {
    for (int i = 0; i < toFilterFields.size(); i++) {
      BloomFilter bloomFilter = bloomFilters.get(i);
      String fieldName = toFilterFields.get(i);
      try {
        computeBitSet(field2id.get(fieldName), bloomFilter, bitSet);
      } catch (SchemaChangeException e) {
        throw new UnsupportedOperationException(e);
      }
    }
    for (int i = 0; i < originalRecordCount; i++) {
      boolean contain = bitSet.get(i);
      if (contain) {
        sv2.setIndex(svIndex, i);
        svIndex++;
      } else {
        filteredRows++;
      }
    }
  }
  appliedTimes++;
  sv2.setRecordCount(svIndex);
}
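For reference, the core selection pattern above can be modeled outside of Drill: hash a row's join key, probe the bloom filter, and record only the indexes of rows that may match. The sketch below is a simplified, self-contained model of the single-filter path; it uses a LongPredicate in place of BloomFilter.find(long), an int[] in place of the SV2, and precomputed row hashes in place of hash64.hash64Code, all of which are stand-ins for this example rather than Drill APIs.

import java.util.function.LongPredicate;

public class RuntimeFilterSketch {

  /**
   * Simplified model of the single-filter path in applyRuntimeFilter():
   * probe the filter with each row's hash and keep only the indexes of rows
   * that may match. Returns the number of surviving rows; their indexes
   * occupy the first N slots of {@code selection}, like an SV2.
   */
  static int filterRows(int rowCount, LongPredicate bloomFilterFind,
                        long[] rowHashes, int[] selection) {
    int svIndex = 0;
    for (int rowIndex = 0; rowIndex < rowCount; rowIndex++) {
      if (bloomFilterFind.test(rowHashes[rowIndex])) { // BloomFilter.find(hash) in Drill
        selection[svIndex++] = rowIndex;               // sv2.setIndex(svIndex, rowIndex)
      }
      // rows failing the probe are simply not added (counted as filteredRows in Drill)
    }
    return svIndex;                                    // sv2.setRecordCount(svIndex)
  }

  public static void main(String[] args) {
    long[] hashes = {11L, 42L, 7L, 42L};
    int[] selection = new int[hashes.length];
    // Toy predicate standing in for a bloom filter that only "contains" hash 42
    int kept = filterRows(hashes.length, h -> h == 42L, hashes, selection);
    System.out.println("kept " + kept + " rows");      // prints: kept 2 rows
  }
}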
use of org.apache.drill.exec.work.filter.BloomFilter in project drill by apache.
the class HashJoinBatch method executeBuildPhase.
/**
* Execute the BUILD phase; first read incoming and split rows into
* partitions; may decide to spill some of the partitions
*
* @return Returns an
* {@link org.apache.drill.exec.record.RecordBatch.IterOutcome} if a
* termination condition is reached. Otherwise returns null.
* @throws SchemaChangeException
*/
public IterOutcome executeBuildPhase() throws SchemaChangeException {
  if (buildSideIsEmpty.booleanValue()) {
    // empty right
    return null;
  }
  if (skipHashTableBuild) {
    // No hash table needed - then consume all the right upstream
    killAndDrainRightUpstream();
    return null;
  }
  HashJoinMemoryCalculator.BuildSidePartitioning buildCalc;
  {
    // Initializing build calculator
    // Limit scope of these variables to this block
    int maxBatchSize = spilledState.isFirstCycle() ? RecordBatch.MAX_BATCH_ROW_COUNT : RECORDS_PER_BATCH;
    boolean doMemoryCalculation = canSpill && !probeSideIsEmpty.booleanValue();
    HashJoinMemoryCalculator calc = getCalculatorImpl();
    calc.initialize(doMemoryCalculation);
    buildCalc = calc.next();
    // TODO Fix after
    buildCalc.initialize(spilledState.isFirstCycle(),
        true, // fixed
        buildBatch, probeBatch, buildJoinColumns,
        probeSideIsEmpty.booleanValue(), allocator.getLimit(),
        numPartitions, RECORDS_PER_BATCH, RECORDS_PER_BATCH,
        maxBatchSize, maxBatchSize,
        batchMemoryManager.getOutputBatchSize(),
        HashTable.DEFAULT_LOAD_FACTOR);
    if (spilledState.isFirstCycle() && doMemoryCalculation) {
      // Do auto tuning
      buildCalc = partitionNumTuning(maxBatchSize, buildCalc);
    }
  }
  if (spilledState.isFirstCycle()) {
    // Do initial setup only on the first cycle
    delayedSetup();
  }
  initializeBuild();
  initializeRuntimeFilter();
  // Make the calculator aware of our partitions
  HashJoinMemoryCalculator.PartitionStatSet partitionStatSet =
      new HashJoinMemoryCalculator.PartitionStatSet(partitions);
  buildCalc.setPartitionStatSet(partitionStatSet);
  boolean moreData = true;
  while (moreData) {
    switch (rightUpstream) {
      case NONE:
      case NOT_YET:
        moreData = false;
        continue;
      case OK_NEW_SCHEMA:
        if (!buildSchema.equals(buildBatch.getSchema())) {
          throw SchemaChangeException.schemaChanged(
              "Hash join does not support schema changes in build side.",
              buildSchema, buildBatch.getSchema());
        }
        for (HashPartition partn : partitions) {
          partn.updateBatches();
        }
        // Fall through
      case OK:
        batchMemoryManager.update(buildBatch, RIGHT_INDEX, 0, true);
        int currentRecordCount = buildBatch.getRecordCount();
        // create runtime filter
        if (spilledState.isFirstCycle() && enableRuntimeFilter) {
          // create runtime filter and send out async
          for (BloomFilter bloomFilter : bloomFilter2buildId.keySet()) {
            int fieldId = bloomFilter2buildId.get(bloomFilter);
            for (int ind = 0; ind < currentRecordCount; ind++) {
              long hashCode = hash64.hash64Code(ind, 0, fieldId);
              bloomFilter.insert(hashCode);
            }
          }
        }
        // Single partition: use the incoming vectors as they are (no row copy)
        if (numPartitions == 1) {
          partitions[0].appendBatch(buildBatch);
          break;
        }
        if (!spilledState.isFirstCycle()) {
          read_right_HV_vector = (IntVector) buildBatch.getContainer().getLast();
        }
        // Hash each row's join key and route the row to a partition based on the result
        for (int ind = 0; ind < currentRecordCount; ind++) {
          int hashCode = spilledState.isFirstCycle()
              ? partitions[0].getBuildHashCode(ind)
              : read_right_HV_vector.getAccessor().get(ind); // get the hash value from the HV column
          int currPart = hashCode & spilledState.getPartitionMask();
          hashCode >>>= spilledState.getBitsInMask();
          // semi-join skips join-key-duplicate rows
          if (semiJoin) {
          }
          // Append the new inner row to the appropriate partition; spill (that partition) if needed
          partitions[currPart].appendInnerRow(buildBatch.getContainer(), ind, hashCode, buildCalc);
        }
        if (read_right_HV_vector != null) {
          read_right_HV_vector.clear();
          read_right_HV_vector = null;
        }
        break;
      default:
        throw new IllegalStateException(rightUpstream.name());
    }
    // Get the next incoming record batch
    rightUpstream = next(HashJoinHelper.RIGHT_INPUT, buildBatch);
  }
  if (spilledState.isFirstCycle() && enableRuntimeFilter) {
    if (bloomFilter2buildId.size() > 0) {
      int hashJoinOpId = this.popConfig.getOperatorId();
      runtimeFilterReporter.sendOut(bloomFilters, probeFields, this.popConfig.getRuntimeFilterDef(), hashJoinOpId);
    }
  }
  // Complete the current inner batch in each partition, adding any spilled
  // partitions to the spilled partitions list; a single partition needs no completion
  if (numPartitions > 1) {
    for (HashPartition partn : partitions) {
      partn.completeAnInnerBatch(false, partn.isSpilled());
    }
  }
  prefetchFirstProbeBatch();
  if (leftUpstream.isError()) {
    // We need to terminate.
    return leftUpstream;
  }
  HashJoinMemoryCalculator.PostBuildCalculations postBuildCalc = buildCalc.next();
  postBuildCalc.initialize(probeSideIsEmpty.booleanValue()); // probeEmpty
  for (int index = 0; index < partitions.length; index++) {
    HashPartition partn = partitions[index];
    if (partn.isSpilled()) {
      // Don't build hash tables for spilled partitions
      continue;
    }
    try {
      if (postBuildCalc.shouldSpill()) {
        // Spill this partition if we need to make room
        partn.spillThisPartition();
      } else {
        // Only build hash tables for partitions that are not spilled
        partn.buildContainersHashTableAndHelper();
      }
    } catch (OutOfMemoryException e) {
      // Include debug info
      String message = "Failed building hash table on partition " + index + ":\n"
          + makeDebugString() + "\n" + postBuildCalc.makeDebugString();
      throw new OutOfMemoryException(message, e);
    }
  }
  if (logger.isDebugEnabled()) {
    logger.debug(postBuildCalc.makeDebugString());
  }
  for (HashPartition partn : partitions) {
    if (partn.isSpilled()) {
      HashJoinSpilledPartition sp = new HashJoinSpilledPartition(spilledState.getCycle(),
          partn.getPartitionNum(), originalPartition, partn.getPartitionBatchesCount(), partn.getSpillFile());
      spilledState.addPartition(sp);
      spilledInners[partn.getPartitionNum()] = sp; // for the outer to find the SP later
      partn.closeWriter();
      partn.updateProbeRecordsPerBatch(postBuildCalc.getProbeRecordsPerBatch());
    }
  }
  return null;
}
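The partition routing in the OK branch above (mask the hash code, then shift it) can be illustrated in isolation. The sketch below is a minimal, self-contained model assuming a power-of-two partition count; the variables partitionMask and bitsInMask mirror what spilledState.getPartitionMask() and spilledState.getBitsInMask() provide in Drill, but the values here are made up for the example.

public class PartitionRoutingSketch {

  public static void main(String[] args) {
    int numPartitions = 8;                                          // assumed power of two
    int bitsInMask = Integer.numberOfTrailingZeros(numPartitions);  // 3
    int partitionMask = numPartitions - 1;                          // 0b111

    int[] hashCodes = {0x1A2B3C4D, 0x00000005, 0x7FFFFFFF};
    for (int hashCode : hashCodes) {
      // Same two steps as in executeBuildPhase():
      int currPart = hashCode & partitionMask; // low bits pick the partition
      hashCode >>>= bitsInMask;                // remaining bits stay available for the in-partition hash table
      System.out.printf("row -> partition %d, residual hash 0x%08X%n", currPart, hashCode);
    }
  }
}

Masking with numPartitions - 1 works only because the partition count is a power of two, which is why the shift by bitsInMask cleanly removes the bits already consumed for partition selection.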
use of org.apache.drill.exec.work.filter.BloomFilter in project drill by apache.
the class HashJoinBatch method initializeRuntimeFilter.
/**
 * Note: This method cannot be called again as part of a recursive call of
 * executeBuildPhase() to handle spilled build partitions.
 */
private void initializeRuntimeFilter() {
  if (!enableRuntimeFilter || bloomFiltersGenerated) {
    return;
  }
  runtimeFilterReporter = new RuntimeFilterReporter((ExecutorFragmentContext) context);
  RuntimeFilterDef runtimeFilterDef = popConfig.getRuntimeFilterDef();
  // Only queries that satisfy the RuntimeFilterRouter's judgement will have a RuntimeFilterDef.
  if (runtimeFilterDef != null) {
    List<BloomFilterDef> bloomFilterDefs = runtimeFilterDef.getBloomFilterDefs();
    for (BloomFilterDef bloomFilterDef : bloomFilterDefs) {
      int buildFieldId = bloomFilterDef2buildId.get(bloomFilterDef);
      int numBytes = bloomFilterDef.getNumBytes();
      String probeField = bloomFilterDef.getProbeField();
      probeFields.add(probeField);
      BloomFilter bloomFilter = new BloomFilter(numBytes, context.getAllocator());
      bloomFilters.add(bloomFilter);
      bloomFilter2buildId.put(bloomFilter, buildFieldId);
    }
  }
  bloomFiltersGenerated = true;
}
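Taken together, the three methods on this page form the runtime-filter lifecycle: the build side creates one BloomFilter per join key (initializeRuntimeFilter), populates it with the hash of every build row and sends it out (executeBuildPhase), and the probe side later calls find() on the same key hashes (applyRuntimeFilter). The sketch below is a toy, self-contained model of that round trip; ToyBloomFilter is not Drill's BloomFilter, it only mimics the insert/find contract shown in the snippets above, and the hash values are made up.

import java.util.BitSet;

public class RuntimeFilterLifecycleSketch {

  /** Toy stand-in for the insert/find contract of org.apache.drill.exec.work.filter.BloomFilter. */
  static class ToyBloomFilter {
    private final BitSet bits;
    private final int numBits;

    ToyBloomFilter(int numBits) {
      this.numBits = numBits;
      this.bits = new BitSet(numBits);
    }

    void insert(long hash) {                 // build side: bloomFilter.insert(hashCode)
      bits.set(Math.floorMod(Long.hashCode(hash), numBits));
      bits.set(Math.floorMod(Long.hashCode(hash * 31), numBits));
    }

    boolean find(long hash) {                // probe side: bloomFilter.find(hash)
      return bits.get(Math.floorMod(Long.hashCode(hash), numBits))
          && bits.get(Math.floorMod(Long.hashCode(hash * 31), numBits));
    }
  }

  public static void main(String[] args) {
    ToyBloomFilter filter = new ToyBloomFilter(1024);

    // Build phase: insert the hash of every build-side join key
    long[] buildKeyHashes = {100L, 200L, 300L};
    for (long h : buildKeyHashes) {
      filter.insert(h);
    }

    // Probe phase: rows whose key hash is definitely absent are filtered out;
    // "true" only means "may be present" (false positives are possible)
    System.out.println(filter.find(200L));   // true
    System.out.println(filter.find(999L));   // likely false
  }
}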