Use of org.apache.drill.exec.physical.impl.common.HashPartition in project drill by apache.
The class HashJoinBatch, method updateStats.
/**
* Updates the {@link HashTable} and spilling stats after the original build
* side is processed.
*
* Note: this does not update all the stats. The cycleNum is updated
* dynamically in {@link #innerNext()} and the total bytes written is updated
* at close time in {@link #cleanup()}.
*/
private void updateStats() {
// no stats when the right side is empty
if (buildSideIsEmpty.booleanValue()) {
return;
}
// these stats are gathered only before processing spilled files
if (!spilledState.isFirstCycle()) {
return;
}
HashTableStats htStats = new HashTableStats();
long numSpilled = 0;
HashTableStats newStats = new HashTableStats();
// sum the stats from all the partitions
for (HashPartition partn : partitions) {
if (partn.isSpilled()) {
numSpilled++;
}
partn.getStats(newStats);
htStats.addStats(newStats);
}
stats.setLongStat(Metric.NUM_BUCKETS, htStats.numBuckets);
stats.setLongStat(Metric.NUM_ENTRIES, htStats.numEntries);
stats.setLongStat(Metric.NUM_RESIZING, htStats.numResizing);
stats.setLongStat(Metric.RESIZING_TIME_MS, htStats.resizingTime);
stats.setLongStat(Metric.NUM_PARTITIONS, numPartitions);
// Put 0 in case no spill
stats.setLongStat(Metric.SPILL_CYCLE, spilledState.getCycle());
stats.setLongStat(Metric.SPILLED_PARTITIONS, numSpilled);
}
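Each HashPartition owns its own hash table, so operator-level metrics are produced by summing per-partition stats into one accumulator before publishing them. Below is a minimal standalone sketch of that accumulation pattern; it is not Drill's HashTableStats class, and the values used in main() are made-up illustrative numbers (only the field names mirror the metrics set above).

import java.util.List;

// Standalone sketch (not Drill code): summing per-partition hash-table stats
// into a single aggregate, mirroring the accumulation loop in updateStats().
public final class HashTableStatsSketch {
    long numBuckets;
    long numEntries;
    long numResizing;
    long resizingTimeMs;

    // Add another partition's stats into this accumulator (like htStats.addStats(newStats)).
    void addStats(HashTableStatsSketch other) {
        numBuckets += other.numBuckets;
        numEntries += other.numEntries;
        numResizing += other.numResizing;
        resizingTimeMs += other.resizingTimeMs;
    }

    public static void main(String[] args) {
        // made-up per-partition stats, for illustration only
        HashTableStatsSketch p0 = new HashTableStatsSketch();
        p0.numBuckets = 1024; p0.numEntries = 900;
        HashTableStatsSketch p1 = new HashTableStatsSketch();
        p1.numBuckets = 2048; p1.numEntries = 1800; p1.numResizing = 1;

        HashTableStatsSketch total = new HashTableStatsSketch();
        for (HashTableStatsSketch partitionStats : List.of(p0, p1)) {
            total.addStats(partitionStats);
        }
        System.out.printf("buckets=%d entries=%d resizes=%d%n",
            total.numBuckets, total.numEntries, total.numResizing);
    }
}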
Use of org.apache.drill.exec.physical.impl.common.HashPartition in project drill by apache.
The class HashJoinProbeTemplate, method executeProbePhase.
private void executeProbePhase() throws SchemaChangeException {
while (outputRecords < targetOutputRecords && probeState != ProbeState.DONE && probeState != ProbeState.PROJECT_RIGHT) {
// Check if we have processed all records in this batch we need to invoke next
if (recordsProcessed == recordsToProcess) {
// Done processing all records in the previous batch, clean up!
for (VectorWrapper<?> wrapper : probeBatch) {
wrapper.getValueVector().clear();
}
IterOutcome leftUpstream = outgoingJoinBatch.next(HashJoinHelper.LEFT_INPUT, probeBatch);
switch(leftUpstream) {
case NONE:
case NOT_YET:
recordsProcessed = 0;
recordsToProcess = 0;
changeToFinalProbeState();
// in case some outer partitions were spilled, need to spill their last batches
for (HashPartition partn : partitions) {
// skip non-spilled
if (!partn.isSpilled()) {
continue;
}
partn.completeAnOuterBatch(false);
// update the partition's spill record with the outer side
HashJoinBatch.HashJoinSpilledPartition sp = spilledInners[partn.getPartitionNum()];
sp.updateOuter(partn.getPartitionBatchesCount(), partn.getSpillFile());
partn.closeWriter();
}
continue;
case OK_NEW_SCHEMA:
if (probeBatch.getSchema().equals(probeSchema)) {
for (HashPartition partn : partitions) {
partn.updateBatches();
}
} else {
throw SchemaChangeException.schemaChanged("Hash join does not support schema changes in probe side.", probeSchema, probeBatch.getSchema());
}
// fall through: process the new-schema batch like a regular OK batch
case OK:
outgoingJoinBatch.getBatchMemoryManager().update(probeBatch, LEFT_INDEX, outputRecords);
// calculated by update()
setTargetOutputCount(outgoingJoinBatch.getBatchMemoryManager().getCurrentOutgoingMaxRowCount());
recordsToProcess = probeBatch.getRecordCount();
recordsProcessed = 0;
// If we received an empty batch do nothing
if (recordsToProcess == 0) {
continue;
}
if (cycleNum > 0) {
// Needed ?
read_left_HV_vector = (IntVector) probeBatch.getContainer().getLast();
}
break;
default:
}
}
int probeIndex = -1;
// Check if we need to drain the next row in the probe side
if (getNextRecord) {
if (!buildSideIsEmpty) {
int hashCode = (cycleNum == 0) ? partitions[0].getProbeHashCode(recordsProcessed) : read_left_HV_vector.getAccessor().get(recordsProcessed);
int currBuildPart = hashCode & partitionMask;
hashCode >>>= bitsInMask;
// Set and keep the current partition (may be used again on subsequent probe calls as
// inner rows of duplicate key are processed); inner if not spilled, else outer
currPartition = partitions[currBuildPart];
// If the matching inner partition was spilled
if (outgoingJoinBatch.isSpilledInner(currBuildPart)) {
// add this row to its outer partition (may cause a spill, when the batch is full)
currPartition.appendOuterRow(hashCode, recordsProcessed);
// done with this outer record
recordsProcessed++;
// on to the next outer record
continue;
}
probeIndex = currPartition.probeForKey(recordsProcessed, hashCode);
}
if (semiJoin) {
if (probeIndex != -1) {
// output the probe side only
outputRecords = outputRow(null, 0, probeBatch.getContainer(), recordsProcessed);
}
recordsProcessed++;
// no build-side duplicates, go on to the next probe-side row
continue;
}
if (probeIndex != -1) {
/* The current probe record has a key that matches. Get the index
* of the first row in the build side that matches the current key
* (and record this match in the bitmap, in case of a FULL/RIGHT join)
*/
Pair<Integer, Boolean> matchStatus = currPartition.getStartIndex(probeIndex);
boolean matchExists = matchStatus.getRight();
if (joinControl.isIntersectDistinct() && matchExists) {
// since it is intersect distinct and we already have one record matched, move to next probe row
recordsProcessed++;
continue;
}
currentCompositeIdx = matchStatus.getLeft();
outputRecords = outputRow(currPartition.getContainers(), currentCompositeIdx, probeBatch.getContainer(), recordsProcessed);
/* Projected single row from the build side with matching key but there
* may be more rows with the same key. Check if that's the case as long as
* we are not doing intersect distinct since it only cares about
* distinct values.
*/
currentCompositeIdx = joinControl.isIntersectDistinct() ? -1 : currPartition.getNextIndex(currentCompositeIdx);
if (currentCompositeIdx == -1) {
/* We only had one row in the build side that matched the current key
* from the probe side. Drain the next row in the probe side.
*/
recordsProcessed++;
} else {
/* There is more than one row with the same key on the build side
* don't drain more records from the probe side till we have projected
* all the rows with this key
*/
getNextRecord = false;
}
} else {
// If we have a left outer join, project the outer side
if (joinType == JoinRelType.LEFT || joinType == JoinRelType.FULL) {
// output only the probe side (the build side would be all nulls)
outputRecords = outputRow(null, 0, probeBatch.getContainer(), recordsProcessed);
}
recordsProcessed++;
}
} else {
// match the next inner row with the same key
currPartition.setRecordMatched(currentCompositeIdx);
outputRecords = outputRow(currPartition.getContainers(), currentCompositeIdx, probeBatch.getContainer(), recordsProcessed);
currentCompositeIdx = currPartition.getNextIndex(currentCompositeIdx);
if (currentCompositeIdx == -1) {
// We don't have any more rows matching the current key on the build side, move on to the next probe row
getNextRecord = true;
recordsProcessed++;
}
}
}
}
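One detail worth spelling out from the probe loop above: the row's hash code does double duty. The low bitsInMask bits select the build-side partition (hashCode & partitionMask), and the remaining bits left after the unsigned shift are what that partition's own hash table is probed with. A minimal standalone sketch follows, assuming a power-of-two partition count; the class and variable names are illustrative, not Drill's.

// Standalone sketch (not Drill code): splitting a 32-bit hash code into a
// partition index (low bits) and a residual hash for the in-partition table,
// as done with partitionMask / bitsInMask in the probe loop above.
public final class PartitionHashSplit {
    public static void main(String[] args) {
        int numPartitions = 16;                 // must be a power of two
        int bitsInMask = Integer.numberOfTrailingZeros(numPartitions); // 4
        int partitionMask = numPartitions - 1;  // 0x0F

        int hashCode = "exampleKey".hashCode();
        int partitionIndex = hashCode & partitionMask; // which partition to probe
        int residualHash = hashCode >>> bitsInMask;    // hash used inside that partition

        System.out.printf("partition=%d residualHash=%d%n", partitionIndex, residualHash);
    }
}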
Use of org.apache.drill.exec.physical.impl.common.HashPartition in project drill by apache.
The class HashJoinBatch, method executeBuildPhase.
/**
* Execute the BUILD phase; first read incoming and split rows into
* partitions; may decide to spill some of the partitions
*
* @return Returns an
* {@link org.apache.drill.exec.record.RecordBatch.IterOutcome} if a
* termination condition is reached. Otherwise returns null.
* @throws SchemaChangeException
*/
public IterOutcome executeBuildPhase() throws SchemaChangeException {
if (buildSideIsEmpty.booleanValue()) {
// empty right
return null;
}
if (skipHashTableBuild) {
// No hash table needed - then consume all the right upstream
killAndDrainRightUpstream();
return null;
}
HashJoinMemoryCalculator.BuildSidePartitioning buildCalc;
{
// Initializing build calculator
// Limit scope of these variables to this block
int maxBatchSize = spilledState.isFirstCycle() ? RecordBatch.MAX_BATCH_ROW_COUNT : RECORDS_PER_BATCH;
boolean doMemoryCalculation = canSpill && !probeSideIsEmpty.booleanValue();
HashJoinMemoryCalculator calc = getCalculatorImpl();
calc.initialize(doMemoryCalculation);
buildCalc = calc.next();
// TODO Fix after
buildCalc.initialize(spilledState.isFirstCycle(), true, // fixed
buildBatch, probeBatch, buildJoinColumns, probeSideIsEmpty.booleanValue(), allocator.getLimit(), numPartitions, RECORDS_PER_BATCH, RECORDS_PER_BATCH, maxBatchSize, maxBatchSize, batchMemoryManager.getOutputBatchSize(), HashTable.DEFAULT_LOAD_FACTOR);
if (spilledState.isFirstCycle() && doMemoryCalculation) {
// Do auto tuning
buildCalc = partitionNumTuning(maxBatchSize, buildCalc);
}
}
if (spilledState.isFirstCycle()) {
// Do initial setup only on the first cycle
delayedSetup();
}
initializeBuild();
initializeRuntimeFilter();
// Make the calculator aware of our partitions
HashJoinMemoryCalculator.PartitionStatSet partitionStatSet = new HashJoinMemoryCalculator.PartitionStatSet(partitions);
buildCalc.setPartitionStatSet(partitionStatSet);
boolean moreData = true;
while (moreData) {
switch(rightUpstream) {
case NONE:
case NOT_YET:
moreData = false;
continue;
case OK_NEW_SCHEMA:
if (!buildSchema.equals(buildBatch.getSchema())) {
throw SchemaChangeException.schemaChanged("Hash join does not support schema changes in build side.", buildSchema, buildBatch.getSchema());
}
for (HashPartition partn : partitions) {
partn.updateBatches();
}
// Fall through
case OK:
batchMemoryManager.update(buildBatch, RIGHT_INDEX, 0, true);
int currentRecordCount = buildBatch.getRecordCount();
// create runtime filter
if (spilledState.isFirstCycle() && enableRuntimeFilter) {
// create runtime filter and send out async
for (BloomFilter bloomFilter : bloomFilter2buildId.keySet()) {
int fieldId = bloomFilter2buildId.get(bloomFilter);
for (int ind = 0; ind < currentRecordCount; ind++) {
long hashCode = hash64.hash64Code(ind, 0, fieldId);
bloomFilter.insert(hashCode);
}
}
}
// single partition: append the incoming vectors as they are (no row copy)
if (numPartitions == 1) {
partitions[0].appendBatch(buildBatch);
break;
}
if (!spilledState.isFirstCycle()) {
read_right_HV_vector = (IntVector) buildBatch.getContainer().getLast();
}
for (int ind = 0; ind < currentRecordCount; ind++) {
// on the first cycle compute the hash code, otherwise get the hash value from the HV column
int hashCode = spilledState.isFirstCycle() ? partitions[0].getBuildHashCode(ind) : read_right_HV_vector.getAccessor().get(ind);
int currPart = hashCode & spilledState.getPartitionMask();
hashCode >>>= spilledState.getBitsInMask();
// semi-join skips join-key-duplicate rows
if (semiJoin) {
}
// Append the new inner row to the appropriate partition; spill (that
// partition) if needed
partitions[currPart].appendInnerRow(buildBatch.getContainer(), ind, hashCode, buildCalc);
}
if (read_right_HV_vector != null) {
read_right_HV_vector.clear();
read_right_HV_vector = null;
}
break;
default:
throw new IllegalStateException(rightUpstream.name());
}
// Get the next incoming record batch
rightUpstream = next(HashJoinHelper.RIGHT_INPUT, buildBatch);
}
if (spilledState.isFirstCycle() && enableRuntimeFilter) {
if (bloomFilter2buildId.size() > 0) {
int hashJoinOpId = this.popConfig.getOperatorId();
runtimeFilterReporter.sendOut(bloomFilters, probeFields, this.popConfig.getRuntimeFilterDef(), hashJoinOpId);
}
}
// Complete each partition's current inner batch; spilled partitions are added to
// the spilled partitions list below. A single partition needs no completion.
if (numPartitions > 1) {
for (HashPartition partn : partitions) {
partn.completeAnInnerBatch(false, partn.isSpilled());
}
}
prefetchFirstProbeBatch();
if (leftUpstream.isError()) {
// We need to terminate.
return leftUpstream;
}
HashJoinMemoryCalculator.PostBuildCalculations postBuildCalc = buildCalc.next();
// probeEmpty
postBuildCalc.initialize(probeSideIsEmpty.booleanValue());
for (int index = 0; index < partitions.length; index++) {
HashPartition partn = partitions[index];
if (partn.isSpilled()) {
// Don't build hash tables for spilled partitions
continue;
}
try {
if (postBuildCalc.shouldSpill()) {
// Spill this partition if we need to make room
partn.spillThisPartition();
} else {
// Only build hash tables for partitions that are not spilled
partn.buildContainersHashTableAndHelper();
}
} catch (OutOfMemoryException e) {
String message = "Failed building hash table on partition " + index + ":\n" + makeDebugString() + "\n" + postBuildCalc.makeDebugString();
// Include debug info
throw new OutOfMemoryException(message, e);
}
}
if (logger.isDebugEnabled()) {
logger.debug(postBuildCalc.makeDebugString());
}
for (HashPartition partn : partitions) {
if (partn.isSpilled()) {
HashJoinSpilledPartition sp = new HashJoinSpilledPartition(spilledState.getCycle(), partn.getPartitionNum(), originalPartition, partn.getPartitionBatchesCount(), partn.getSpillFile());
spilledState.addPartition(sp);
// for the outer to find the SP later
spilledInners[partn.getPartitionNum()] = sp;
partn.closeWriter();
partn.updateProbeRecordsPerBatch(postBuildCalc.getProbeRecordsPerBatch());
}
}
return null;
}
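The heart of executeBuildPhase is a row-routing loop: compute (or read back) a hash code, mask off the low bits to pick a partition, append the row there, and spill a partition when the memory calculator says there is no room. The standalone sketch below shows only that shape; the Partition class, the byte accounting, and the spill trigger are simplified assumptions, not Drill's HashPartition or HashJoinMemoryCalculator.

import java.util.ArrayList;
import java.util.List;

// Standalone sketch (not Drill code): routing each build-side row to a partition by
// its hash and spilling a partition when a memory budget is exceeded.
// The row type (String), the byte accounting, and the spill action are simplified assumptions.
public final class BuildSideSketch {
    static final int NUM_PARTITIONS = 8;               // power of two, like numPartitions above
    static final int PARTITION_MASK = NUM_PARTITIONS - 1;
    static final long MEMORY_BUDGET_BYTES = 4;          // tiny budget so the demo actually spills

    static final class Partition {
        final List<String> inMemoryRows = new ArrayList<>();
        long bytesHeld;
        boolean spilled;

        void append(String row) {
            inMemoryRows.add(row);
            bytesHeld += row.length();                   // crude size estimate
        }

        void spill() {                                   // placeholder for writing batches to a spill file
            System.out.println("spilling " + inMemoryRows.size() + " rows");
            inMemoryRows.clear();
            bytesHeld = 0;
            spilled = true;
        }
    }

    public static void main(String[] args) {
        Partition[] partitions = new Partition[NUM_PARTITIONS];
        for (int i = 0; i < NUM_PARTITIONS; i++) {
            partitions[i] = new Partition();
        }
        long totalBytes = 0;
        for (String row : List.of("a", "bb", "ccc")) {
            Partition target = partitions[row.hashCode() & PARTITION_MASK];
            target.append(row);
            totalBytes += row.length();
            if (totalBytes > MEMORY_BUDGET_BYTES) {      // over budget: spill the partition we just grew
                totalBytes -= target.bytesHeld;
                target.spill();
            }
        }
        System.out.println("done; bytes still in memory = " + totalBytes);
    }
}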
Use of org.apache.drill.exec.physical.impl.common.HashPartition in project drill by apache.
The class HashJoinBatch, method makeDebugString.
/**
* This creates a string that summarizes the memory usage of the operator.
*
* @return A memory dump string.
*/
public String makeDebugString() {
StringBuilder sb = new StringBuilder();
for (int partitionIndex = 0; partitionIndex < partitions.length; partitionIndex++) {
String partitionPrefix = "Partition " + partitionIndex + ": ";
HashPartition hashPartition = partitions[partitionIndex];
sb.append(partitionPrefix).append(hashPartition.makeDebugString()).append("\n");
}
return sb.toString();
}
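The dump is just a per-partition summary joined with a "Partition N: " prefix, so the same shape can be reproduced with any per-partition describe string. A minimal standalone sketch, with made-up partition descriptions standing in for HashPartition.makeDebugString():

// Standalone sketch (not Drill code): assembling a per-partition memory dump string,
// mirroring the prefix-per-partition pattern above. The summaries are fabricated examples.
public final class DebugStringSketch {
    public static void main(String[] args) {
        String[] partitionSummaries = {
            "in-memory batches: 3, bytes: 262144, spilled: false",
            "in-memory batches: 0, bytes: 0, spilled: true"
        };
        StringBuilder sb = new StringBuilder();
        for (int partitionIndex = 0; partitionIndex < partitionSummaries.length; partitionIndex++) {
            sb.append("Partition ").append(partitionIndex).append(": ")
              .append(partitionSummaries[partitionIndex]).append("\n");
        }
        System.out.print(sb);
    }
}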
Use of org.apache.drill.exec.physical.impl.common.HashPartition in project drill by apache.
The class HashJoinBatch, method innerNext.
@Override
public IterOutcome innerNext() {
if (wasKilled) {
// We have received a cancel signal. We need to stop processing.
cleanup();
return IterOutcome.NONE;
}
prefetchFirstBuildBatch();
if (rightUpstream.isError()) {
// We need to terminate.
return rightUpstream;
}
try {
/*
* If we are here for the first time, execute the build phase of the hash
* join and setup the run time generated class for the probe side
*/
if (state == BatchState.FIRST) {
// Build the hash table, using the build side record batches.
IterOutcome buildExecuteTermination = executeBuildPhase();
if (buildExecuteTermination != null) {
// We need to terminate.
return buildExecuteTermination;
}
buildComplete = true;
if (isRowKeyJoin) {
// discard the first left batch which was fetched by buildSchema, and
// get the new one based on rowkey join
leftUpstream = next(left);
}
// Update the hash table related stats for the operator
updateStats();
}
// Try to probe and project, or recursively handle a spilled partition
// If there are build-side rows, or if this is a left/full outer join
if (!buildSideIsEmpty.booleanValue() || joinIsLeftOrFull) {
prefetchFirstProbeBatch();
if (leftUpstream.isError() || (leftUpstream == NONE && !joinIsRightOrFull)) {
// We need to terminate.
return leftUpstream;
}
if (!buildSideIsEmpty.booleanValue() || !probeSideIsEmpty.booleanValue()) {
if (state == BatchState.FIRST) {
// Initialize various settings for the probe side
hashJoinProbe.setupHashJoinProbe(probeBatch, this, joinType, semiJoin, leftUpstream, partitions, spilledState.getCycle(), container, spilledInners, buildSideIsEmpty.booleanValue(), numPartitions, rightHVColPosition);
}
// Allocate the memory for the vectors in the output container
batchMemoryManager.allocateVectors(container);
hashJoinProbe.setTargetOutputCount(batchMemoryManager.getOutputRowCount());
outputRecords = hashJoinProbe.probeAndProject();
container.setValueCount(outputRecords);
batchMemoryManager.updateOutgoingStats(outputRecords);
RecordBatchStats.logRecordBatchStats(RecordBatchIOType.OUTPUT, this, getRecordBatchStatsContext());
/*
* We are here because of one of the following:
* 1. We have completed processing all the records and we are done.
* 2. We have filled up the outgoing batch to the maximum and need to return upstream.
* In either case, build the output container's schema and return.
*/
if (outputRecords > 0 || state == BatchState.FIRST) {
state = BatchState.NOT_FIRST;
return IterOutcome.OK;
}
}
// (In case need to start processing spilled partitions)
for (HashPartition partn : partitions) {
// clean, but do not delete the spill files !!
partn.cleanup(false);
}
if (!buildSideIsEmpty.booleanValue()) {
// the "while" is only used for skipping; see "continue" below
while (!spilledState.isEmpty()) {
// Get the next (previously) spilled partition to handle as incoming
HashJoinSpilledPartition currSp = spilledState.getNextSpilledPartition();
if (currSp.outerSpilledBatches == 0 && !joinIsRightOrFull) {
continue;
}
// Create a BUILD-side "incoming" out of the inner spill file of
// that partition
buildBatch = new SpilledRecordBatch(currSp.innerSpillFile, currSp.innerSpilledBatches, context, buildSchema, oContext, spillSet);
// The above ctor call also got the first batch; need to update the
// outcome
rightUpstream = ((SpilledRecordBatch) buildBatch).getInitialOutcome();
if (currSp.outerSpilledBatches > 0) {
// Create a PROBE-side "incoming" out of the outer spill file of
// that partition
probeBatch = new SpilledRecordBatch(currSp.outerSpillFile, currSp.outerSpilledBatches, context, probeSchema, oContext, spillSet);
// The above ctor call also got the first batch; need to update
// the outcome
leftUpstream = ((SpilledRecordBatch) probeBatch).getInitialOutcome();
} else {
// if no outer batch then reuse left - needed for updateIncoming()
probeBatch = left;
leftUpstream = IterOutcome.NONE;
hashJoinProbe.changeToFinalProbeState();
}
spilledState.updateCycle(stats, currSp, spilledStateUpdater);
// TODO need to determine if this is still necessary since
// prefetchFirstBatchFromBothSides sets this
state = BatchState.FIRST;
prefetchedBuild.setValue(false);
prefetchedProbe.setValue(false);
// start processing the next spilled partition "recursively"
return innerNext();
}
}
} else {
// Our build side is empty, we won't have any matches, clear the probe
// side
killAndDrainLeftUpstream();
}
// No more output records, clean up and return
state = BatchState.DONE;
cleanup();
return IterOutcome.NONE;
} catch (SchemaChangeException e) {
throw UserException.schemaChangeError(e).build(logger);
}
}
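The control flow worth noting in innerNext() is how spilled partitions are handled: the spilled state acts as a work queue, and the method re-enters itself for each spilled partition until the queue is empty. The standalone sketch below mimics only that recursion shape; the queue of spill files and the reprocess step are illustrative assumptions, not Drill's SpilledState or SpilledRecordBatch.

import java.util.ArrayDeque;
import java.util.Deque;

// Standalone sketch (not Drill code): the control-flow shape of innerNext() when spilled
// partitions remain - treat the spilled set as a queue and re-enter the same processing
// method for each entry. Names and spill-file paths are illustrative only.
public final class SpillCycleSketch {
    private final Deque<String> spilledPartitions = new ArrayDeque<>();

    SpillCycleSketch(String... spillFiles) {
        for (String f : spillFiles) {
            spilledPartitions.add(f);
        }
    }

    // Returns false when fully done, true while another cycle produced output.
    boolean innerNext() {
        if (spilledPartitions.isEmpty()) {
            return false;                       // nothing left: DONE
        }
        String spillFile = spilledPartitions.poll();
        reprocess(spillFile);                   // build + probe again from the spill file
        return innerNext();                     // handle the next spilled partition "recursively"
    }

    private void reprocess(String spillFile) {
        System.out.println("re-processing spilled partition from " + spillFile);
    }

    public static void main(String[] args) {
        new SpillCycleSketch("/tmp/p0.spill", "/tmp/p1.spill").innerNext();
    }
}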