use of org.apache.drill.exec.work.filter.BloomFilter in project drill by apache.
the class RuntimeFilterRecordBatch method applyRuntimeFilter.
/**
 * If a RuntimeFilter is available, applies the filter condition on the
 * incoming batch records and creates an SV2 that stores the indexes of the
 * records that pass the filter condition. When no RuntimeFilter is
 * available, it simply passes all records from the incoming batch through
 * to the downstream operator.
 */
private void applyRuntimeFilter() {
  if (originalRecordCount <= 0) {
    sv2.setRecordCount(0);
    return;
  }
  current = context.getRuntimeFilter(rfIdentifier);
  timedWaiting();
  batchTimes++;
  sv2.allocateNew(originalRecordCount);
  if (current == null) {
    // means none of the rows are filtered out hence set all the indexes
    for (int i = 0; i < originalRecordCount; ++i) {
      sv2.setIndex(i, i);
    }
    sv2.setRecordCount(originalRecordCount);
    return;
  }
  // Setup a hash helper if needed
  setupHashHelper();
  // To make each independent bloom filter work together to construct a final filter result: BitSet.
  BitSet bitSet = new BitSet(originalRecordCount);
  int filterSize = toFilterFields.size();
  int svIndex = 0;
  if (filterSize == 1) {
    BloomFilter bloomFilter = bloomFilters.get(0);
    String fieldName = toFilterFields.get(0);
    int fieldId = field2id.get(fieldName);
    for (int rowIndex = 0; rowIndex < originalRecordCount; rowIndex++) {
      long hash;
      try {
        hash = hash64.hash64Code(rowIndex, 0, fieldId);
      } catch (SchemaChangeException e) {
        throw new UnsupportedOperationException(e);
      }
      boolean contain = bloomFilter.find(hash);
      if (contain) {
        sv2.setIndex(svIndex, rowIndex);
        svIndex++;
      } else {
        filteredRows++;
      }
    }
  } else {
    for (int i = 0; i < toFilterFields.size(); i++) {
      BloomFilter bloomFilter = bloomFilters.get(i);
      String fieldName = toFilterFields.get(i);
      try {
        computeBitSet(field2id.get(fieldName), bloomFilter, bitSet);
      } catch (SchemaChangeException e) {
        throw new UnsupportedOperationException(e);
      }
    }
    for (int i = 0; i < originalRecordCount; i++) {
      boolean contain = bitSet.get(i);
      if (contain) {
        sv2.setIndex(svIndex, i);
        svIndex++;
      } else {
        filteredRows++;
      }
    }
  }
  appliedTimes++;
  sv2.setRecordCount(svIndex);
}
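For reference, the core selection pattern above can be modeled outside of Drill: hash a row's join key, probe the bloom filter, and record only the indexes of rows that may match. The sketch below is a simplified, self-contained model of the single-filter path; it uses a LongPredicate in place of BloomFilter.find(long), an int[] in place of the SV2, and precomputed row hashes in place of hash64.hash64Code, all of which are stand-ins for this example rather than Drill APIs.

import java.util.function.LongPredicate;

public class RuntimeFilterSketch {

  /**
   * Simplified model of the single-filter path in applyRuntimeFilter():
   * probe the filter with each row's hash and keep only the indexes of rows
   * that may match. Returns the number of surviving rows; their indexes
   * occupy the first N slots of {@code selection}, like an SV2.
   */
  static int filterRows(int rowCount, LongPredicate bloomFilterFind,
                        long[] rowHashes, int[] selection) {
    int svIndex = 0;
    for (int rowIndex = 0; rowIndex < rowCount; rowIndex++) {
      if (bloomFilterFind.test(rowHashes[rowIndex])) { // BloomFilter.find(hash) in Drill
        selection[svIndex++] = rowIndex;               // sv2.setIndex(svIndex, rowIndex)
      }
      // rows failing the probe are simply not added (counted as filteredRows in Drill)
    }
    return svIndex;                                    // sv2.setRecordCount(svIndex)
  }

  public static void main(String[] args) {
    long[] hashes = {11L, 42L, 7L, 42L};
    int[] selection = new int[hashes.length];
    // Toy predicate standing in for a bloom filter that only "contains" hash 42
    int kept = filterRows(hashes.length, h -> h == 42L, hashes, selection);
    System.out.println("kept " + kept + " rows");      // prints: kept 2 rows
  }
}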
use of org.apache.drill.exec.work.filter.BloomFilter in project drill by apache.
the class HashJoinBatch method executeBuildPhase.
/**
* Execute the BUILD phase; first read incoming and split rows into
* partitions; may decide to spill some of the partitions
*
* @return Returns an
* {@link org.apache.drill.exec.record.RecordBatch.IterOutcome} if a
* termination condition is reached. Otherwise returns null.
* @throws SchemaChangeException
*/
public IterOutcome executeBuildPhase() throws SchemaChangeException {
  if (buildSideIsEmpty.booleanValue()) {
    // empty right
    return null;
  }
  if (skipHashTableBuild) {
    // No hash table needed - then consume all the right upstream
    killAndDrainRightUpstream();
    return null;
  }
  HashJoinMemoryCalculator.BuildSidePartitioning buildCalc;
  {
    // Initializing build calculator
    // Limit scope of these variables to this block
    int maxBatchSize = spilledState.isFirstCycle() ? RecordBatch.MAX_BATCH_ROW_COUNT : RECORDS_PER_BATCH;
    boolean doMemoryCalculation = canSpill && !probeSideIsEmpty.booleanValue();
    HashJoinMemoryCalculator calc = getCalculatorImpl();
    calc.initialize(doMemoryCalculation);
    buildCalc = calc.next();
    // TODO Fix after
    buildCalc.initialize(spilledState.isFirstCycle(),
        true, // fixed
        buildBatch, probeBatch, buildJoinColumns,
        probeSideIsEmpty.booleanValue(), allocator.getLimit(),
        numPartitions, RECORDS_PER_BATCH, RECORDS_PER_BATCH,
        maxBatchSize, maxBatchSize,
        batchMemoryManager.getOutputBatchSize(),
        HashTable.DEFAULT_LOAD_FACTOR);
    if (spilledState.isFirstCycle() && doMemoryCalculation) {
      // Do auto tuning
      buildCalc = partitionNumTuning(maxBatchSize, buildCalc);
    }
  }
  if (spilledState.isFirstCycle()) {
    // Do initial setup only on the first cycle
    delayedSetup();
  }
  initializeBuild();
  initializeRuntimeFilter();
  // Make the calculator aware of our partitions
  HashJoinMemoryCalculator.PartitionStatSet partitionStatSet =
      new HashJoinMemoryCalculator.PartitionStatSet(partitions);
  buildCalc.setPartitionStatSet(partitionStatSet);
  boolean moreData = true;
  while (moreData) {
    switch (rightUpstream) {
      case NONE:
      case NOT_YET:
        moreData = false;
        continue;
      case OK_NEW_SCHEMA:
        if (!buildSchema.equals(buildBatch.getSchema())) {
          throw SchemaChangeException.schemaChanged(
              "Hash join does not support schema changes in build side.",
              buildSchema, buildBatch.getSchema());
        }
        for (HashPartition partn : partitions) {
          partn.updateBatches();
        }
        // Fall through
      case OK:
        batchMemoryManager.update(buildBatch, RIGHT_INDEX, 0, true);
        int currentRecordCount = buildBatch.getRecordCount();
        // create runtime filter
        if (spilledState.isFirstCycle() && enableRuntimeFilter) {
          // create runtime filter and send out async
          for (BloomFilter bloomFilter : bloomFilter2buildId.keySet()) {
            int fieldId = bloomFilter2buildId.get(bloomFilter);
            for (int ind = 0; ind < currentRecordCount; ind++) {
              long hashCode = hash64.hash64Code(ind, 0, fieldId);
              bloomFilter.insert(hashCode);
            }
          }
        }
        // Single partition: use the incoming vectors as they are (no row copy)
        if (numPartitions == 1) {
          partitions[0].appendBatch(buildBatch);
          break;
        }
        if (!spilledState.isFirstCycle()) {
          read_right_HV_vector = (IntVector) buildBatch.getContainer().getLast();
        }
        // Hash each row's join key and route the row to a partition based on the result
        for (int ind = 0; ind < currentRecordCount; ind++) {
          int hashCode = spilledState.isFirstCycle()
              ? partitions[0].getBuildHashCode(ind)
              : read_right_HV_vector.getAccessor().get(ind); // get the hash value from the HV column
          int currPart = hashCode & spilledState.getPartitionMask();
          hashCode >>>= spilledState.getBitsInMask();
          // semi-join skips join-key-duplicate rows
          if (semiJoin) {
          }
          // Append the new inner row to the appropriate partition; spill (that partition) if needed
          partitions[currPart].appendInnerRow(buildBatch.getContainer(), ind, hashCode, buildCalc);
        }
        if (read_right_HV_vector != null) {
          read_right_HV_vector.clear();
          read_right_HV_vector = null;
        }
        break;
      default:
        throw new IllegalStateException(rightUpstream.name());
    }
    // Get the next incoming record batch
    rightUpstream = next(HashJoinHelper.RIGHT_INPUT, buildBatch);
  }
  if (spilledState.isFirstCycle() && enableRuntimeFilter) {
    if (bloomFilter2buildId.size() > 0) {
      int hashJoinOpId = this.popConfig.getOperatorId();
      runtimeFilterReporter.sendOut(bloomFilters, probeFields, this.popConfig.getRuntimeFilterDef(), hashJoinOpId);
    }
  }
  // Complete the current inner batch in each partition, adding any spilled
  // partitions to the spilled partitions list; a single partition needs no completion
  if (numPartitions > 1) {
    for (HashPartition partn : partitions) {
      partn.completeAnInnerBatch(false, partn.isSpilled());
    }
  }
  prefetchFirstProbeBatch();
  if (leftUpstream.isError()) {
    // We need to terminate.
    return leftUpstream;
  }
  HashJoinMemoryCalculator.PostBuildCalculations postBuildCalc = buildCalc.next();
  postBuildCalc.initialize(probeSideIsEmpty.booleanValue()); // probeEmpty
  for (int index = 0; index < partitions.length; index++) {
    HashPartition partn = partitions[index];
    if (partn.isSpilled()) {
      // Don't build hash tables for spilled partitions
      continue;
    }
    try {
      if (postBuildCalc.shouldSpill()) {
        // Spill this partition if we need to make room
        partn.spillThisPartition();
      } else {
        // Only build hash tables for partitions that are not spilled
        partn.buildContainersHashTableAndHelper();
      }
    } catch (OutOfMemoryException e) {
      // Include debug info
      String message = "Failed building hash table on partition " + index + ":\n"
          + makeDebugString() + "\n" + postBuildCalc.makeDebugString();
      throw new OutOfMemoryException(message, e);
    }
  }
  if (logger.isDebugEnabled()) {
    logger.debug(postBuildCalc.makeDebugString());
  }
  for (HashPartition partn : partitions) {
    if (partn.isSpilled()) {
      HashJoinSpilledPartition sp = new HashJoinSpilledPartition(spilledState.getCycle(),
          partn.getPartitionNum(), originalPartition, partn.getPartitionBatchesCount(), partn.getSpillFile());
      spilledState.addPartition(sp);
      spilledInners[partn.getPartitionNum()] = sp; // for the outer to find the SP later
      partn.closeWriter();
      partn.updateProbeRecordsPerBatch(postBuildCalc.getProbeRecordsPerBatch());
    }
  }
  return null;
}
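The partition routing in the OK branch above (mask the hash code, then shift it) can be illustrated in isolation. The sketch below is a minimal, self-contained model assuming a power-of-two partition count; the variables partitionMask and bitsInMask mirror what spilledState.getPartitionMask() and spilledState.getBitsInMask() provide in Drill, but the values here are made up for the example.

public class PartitionRoutingSketch {

  public static void main(String[] args) {
    int numPartitions = 8;                                          // assumed power of two
    int bitsInMask = Integer.numberOfTrailingZeros(numPartitions);  // 3
    int partitionMask = numPartitions - 1;                          // 0b111

    int[] hashCodes = {0x1A2B3C4D, 0x00000005, 0x7FFFFFFF};
    for (int hashCode : hashCodes) {
      // Same two steps as in executeBuildPhase():
      int currPart = hashCode & partitionMask; // low bits pick the partition
      hashCode >>>= bitsInMask;                // remaining bits stay available for the in-partition hash table
      System.out.printf("row -> partition %d, residual hash 0x%08X%n", currPart, hashCode);
    }
  }
}

Masking with numPartitions - 1 works only because the partition count is a power of two, which is why the shift by bitsInMask cleanly removes the bits already consumed for partition selection.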
use of org.apache.drill.exec.work.filter.BloomFilter in project drill by apache.
the class HashJoinBatch method initializeRuntimeFilter.
/**
 * Note: This method cannot be called again as part of a recursive call of
 * executeBuildPhase() to handle spilled build partitions.
 */
private void initializeRuntimeFilter() {
  if (!enableRuntimeFilter || bloomFiltersGenerated) {
    return;
  }
  runtimeFilterReporter = new RuntimeFilterReporter((ExecutorFragmentContext) context);
  RuntimeFilterDef runtimeFilterDef = popConfig.getRuntimeFilterDef();
  // Only queries that satisfy the RuntimeFilterRouter's judgement will have a RuntimeFilterDef.
  if (runtimeFilterDef != null) {
    List<BloomFilterDef> bloomFilterDefs = runtimeFilterDef.getBloomFilterDefs();
    for (BloomFilterDef bloomFilterDef : bloomFilterDefs) {
      int buildFieldId = bloomFilterDef2buildId.get(bloomFilterDef);
      int numBytes = bloomFilterDef.getNumBytes();
      String probeField = bloomFilterDef.getProbeField();
      probeFields.add(probeField);
      BloomFilter bloomFilter = new BloomFilter(numBytes, context.getAllocator());
      bloomFilters.add(bloomFilter);
      bloomFilter2buildId.put(bloomFilter, buildFieldId);
    }
  }
  bloomFiltersGenerated = true;
}
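Taken together, the three methods on this page form the runtime-filter lifecycle: the build side creates one BloomFilter per join key (initializeRuntimeFilter), populates it with the hash of every build row and sends it out (executeBuildPhase), and the probe side later calls find() on the same key hashes (applyRuntimeFilter). The sketch below is a toy, self-contained model of that round trip; ToyBloomFilter is not Drill's BloomFilter, it only mimics the insert/find contract shown in the snippets above, and the hash values are made up.

import java.util.BitSet;

public class RuntimeFilterLifecycleSketch {

  /** Toy stand-in for the insert/find contract of org.apache.drill.exec.work.filter.BloomFilter. */
  static class ToyBloomFilter {
    private final BitSet bits;
    private final int numBits;

    ToyBloomFilter(int numBits) {
      this.numBits = numBits;
      this.bits = new BitSet(numBits);
    }

    void insert(long hash) {                 // build side: bloomFilter.insert(hashCode)
      bits.set(Math.floorMod(Long.hashCode(hash), numBits));
      bits.set(Math.floorMod(Long.hashCode(hash * 31), numBits));
    }

    boolean find(long hash) {                // probe side: bloomFilter.find(hash)
      return bits.get(Math.floorMod(Long.hashCode(hash), numBits))
          && bits.get(Math.floorMod(Long.hashCode(hash * 31), numBits));
    }
  }

  public static void main(String[] args) {
    ToyBloomFilter filter = new ToyBloomFilter(1024);

    // Build phase: insert the hash of every build-side join key
    long[] buildKeyHashes = {100L, 200L, 300L};
    for (long h : buildKeyHashes) {
      filter.insert(h);
    }

    // Probe phase: rows whose key hash is definitely absent are filtered out;
    // "true" only means "may be present" (false positives are possible)
    System.out.println(filter.find(200L));   // true
    System.out.println(filter.find(999L));   // likely false
  }
}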