use of org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashTableResult in project hive by apache.
the class VectorMapJoinLeftSemiMultiKeyOperator method process.
//---------------------------------------------------------------------------
// Process Multi-Key Left-Semi Join on a vectorized row batch.
//
@Override
public void process(Object row, int tag) throws HiveException {
try {
VectorizedRowBatch batch = (VectorizedRowBatch) row;
alias = (byte) tag;
if (needCommonSetup) {
// Our one time process method initialization.
commonSetup(batch);
/*
* Initialize Multi-Key members for this specialized class.
*/
keyVectorSerializeWrite = new VectorSerializeRow(new BinarySortableSerializeWrite(bigTableKeyColumnMap.length));
keyVectorSerializeWrite.init(bigTableKeyTypeInfos, bigTableKeyColumnMap);
currentKeyOutput = new Output();
saveKeyOutput = new Output();
needCommonSetup = false;
}
if (needHashTableSetup) {
// Setup our hash table specialization. It will be the first time the process
// method is called, or after a Hybrid Grace reload.
/*
* Get our Multi-Key hash set information for this specialized class.
*/
hashSet = (VectorMapJoinBytesHashSet) vectorMapJoinHashTable;
needHashTableSetup = false;
}
batchCounter++;
// For left semi joins, we may apply the filter(s) now.
for (VectorExpression ve : bigTableFilterExpressions) {
ve.evaluate(batch);
}
final int inputLogicalSize = batch.size;
if (inputLogicalSize == 0) {
if (isLogDebugEnabled) {
LOG.debug(CLASS_NAME + " batch #" + batchCounter + " empty");
}
return;
}
// Perform any key expressions. Results will go into scratch columns.
if (bigTableKeyExpressions != null) {
for (VectorExpression ve : bigTableKeyExpressions) {
ve.evaluate(batch);
}
}
/*
* Multi-Key specific declarations.
*/
// None.
/*
* Multi-Key Long check for repeating.
*/
// If all BigTable input columns to key expressions are isRepeating, then
// calculate key once; lookup once.
boolean allKeyInputColumnsRepeating;
if (bigTableKeyColumnMap.length == 0) {
allKeyInputColumnsRepeating = false;
} else {
allKeyInputColumnsRepeating = true;
for (int i = 0; i < bigTableKeyColumnMap.length; i++) {
if (!batch.cols[bigTableKeyColumnMap[i]].isRepeating) {
allKeyInputColumnsRepeating = false;
break;
}
}
}
if (allKeyInputColumnsRepeating) {
/*
* Repeating.
*/
// All key input columns are repeating. Generate key once. Lookup once.
// Since the key is repeated, we must use entry 0 regardless of selectedInUse.
/*
* Multi-Key specific repeated lookup.
*/
keyVectorSerializeWrite.setOutput(currentKeyOutput);
keyVectorSerializeWrite.serializeWrite(batch, 0);
JoinUtil.JoinResult joinResult;
if (keyVectorSerializeWrite.getHasAnyNulls()) {
joinResult = JoinUtil.JoinResult.NOMATCH;
} else {
byte[] keyBytes = currentKeyOutput.getData();
int keyLength = currentKeyOutput.getLength();
// LOG.debug(CLASS_NAME + " processOp all " + displayBytes(keyBytes, 0, keyLength));
joinResult = hashSet.contains(keyBytes, 0, keyLength, hashSetResults[0]);
}
if (isLogDebugEnabled) {
LOG.debug(CLASS_NAME + " batch #" + batchCounter + " repeated joinResult " + joinResult.name());
}
finishLeftSemiRepeated(batch, joinResult, hashSetResults[0]);
} else {
if (isLogDebugEnabled) {
LOG.debug(CLASS_NAME + " batch #" + batchCounter + " non-repeated");
}
// We remember any matching rows in matchs / matchSize. At the end of the loop,
// selected / batch.size will represent both matching and non-matching rows for outer join.
// Only deferred rows will have been removed from selected.
int[] selected = batch.selected;
boolean selectedInUse = batch.selectedInUse;
int hashSetResultCount = 0;
int allMatchCount = 0;
int spillCount = 0;
/*
* Multi-Key specific variables.
*/
Output temp;
// We optimize performance by only looking up the first key in a series of equal keys.
boolean haveSaveKey = false;
JoinUtil.JoinResult saveJoinResult = JoinUtil.JoinResult.NOMATCH;
// Logical loop over the rows in the batch since the batch may have selected in use.
for (int logical = 0; logical < inputLogicalSize; logical++) {
int batchIndex = (selectedInUse ? selected[logical] : logical);
/*
* Multi-Key get key.
*/
// Generate binary sortable key for current row in vectorized row batch.
keyVectorSerializeWrite.setOutput(currentKeyOutput);
keyVectorSerializeWrite.serializeWrite(batch, batchIndex);
boolean isAnyNull = keyVectorSerializeWrite.getHasAnyNulls();
if (isAnyNull || !haveSaveKey || !saveKeyOutput.arraysEquals(currentKeyOutput)) {
if (haveSaveKey) {
// Move on with our counts.
switch(saveJoinResult) {
case MATCH:
// We have extracted the existence from the hash set result, so we don't keep it.
break;
case SPILL:
// We keep the hash set result for its spill information.
hashSetResultCount++;
break;
case NOMATCH:
break;
}
}
if (isAnyNull) {
saveJoinResult = JoinUtil.JoinResult.NOMATCH;
haveSaveKey = false;
} else {
// Regardless of our matching result, we keep that information to make multiple use
// of it for a possible series of equal keys.
haveSaveKey = true;
/*
* Multi-Key specific save key and lookup.
*/
temp = saveKeyOutput;
saveKeyOutput = currentKeyOutput;
currentKeyOutput = temp;
/*
* Multi-key specific lookup key.
*/
byte[] keyBytes = saveKeyOutput.getData();
int keyLength = saveKeyOutput.getLength();
saveJoinResult = hashSet.contains(keyBytes, 0, keyLength, hashSetResults[hashSetResultCount]);
}
switch(saveJoinResult) {
case MATCH:
allMatchs[allMatchCount++] = batchIndex;
// VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " MATCH isSingleValue " + equalKeySeriesIsSingleValue[equalKeySeriesCount] + " currentKey " + currentKey);
break;
case SPILL:
spills[spillCount] = batchIndex;
spillHashMapResultIndices[spillCount] = hashSetResultCount;
spillCount++;
break;
case NOMATCH:
// VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " NOMATCH" + " currentKey " + currentKey);
break;
}
} else {
switch(saveJoinResult) {
case MATCH:
allMatchs[allMatchCount++] = batchIndex;
// VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " MATCH duplicate");
break;
case SPILL:
spills[spillCount] = batchIndex;
spillHashMapResultIndices[spillCount] = hashSetResultCount;
spillCount++;
break;
case NOMATCH:
// VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " NOMATCH duplicate");
break;
}
}
}
if (haveSaveKey) {
// Update our counts for the last key.
switch(saveJoinResult) {
case MATCH:
// We have extracted the existence from the hash set result, so we don't keep it.
break;
case SPILL:
// We keep the hash set result for its spill information.
hashSetResultCount++;
break;
case NOMATCH:
break;
}
}
if (isLogDebugEnabled) {
LOG.debug(CLASS_NAME + " allMatchs " + intArrayToRangesString(allMatchs, allMatchCount) + " spills " + intArrayToRangesString(spills, spillCount) + " spillHashMapResultIndices " + intArrayToRangesString(spillHashMapResultIndices, spillCount) + " hashMapResults " + Arrays.toString(Arrays.copyOfRange(hashSetResults, 0, hashSetResultCount)));
}
finishLeftSemi(batch, allMatchCount, spillCount, (VectorMapJoinHashTableResult[]) hashSetResults);
}
if (batch.size > 0) {
// Forward any remaining selected rows.
forwardBigTableBatch(batch);
}
} catch (IOException e) {
throw new HiveException(e);
} catch (Exception e) {
throw new HiveException(e);
}
}
use of org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashTableResult in project hive by apache.
the class VectorMapJoinLeftSemiStringOperator method process.
//---------------------------------------------------------------------------
// Process Single-Column String Left-Semi Join on a vectorized row batch.
//
@Override
public void process(Object row, int tag) throws HiveException {
try {
VectorizedRowBatch batch = (VectorizedRowBatch) row;
alias = (byte) tag;
if (needCommonSetup) {
// Our one time process method initialization.
commonSetup(batch);
/*
* Initialize Single-Column String members for this specialized class.
*/
singleJoinColumn = bigTableKeyColumnMap[0];
needCommonSetup = false;
}
if (needHashTableSetup) {
// Setup our hash table specialization. It will be the first time the process
// method is called, or after a Hybrid Grace reload.
/*
* Get our Single-Column String hash set information for this specialized class.
*/
hashSet = (VectorMapJoinBytesHashSet) vectorMapJoinHashTable;
needHashTableSetup = false;
}
batchCounter++;
// For left semi joins, we may apply the filter(s) now.
for (VectorExpression ve : bigTableFilterExpressions) {
ve.evaluate(batch);
}
final int inputLogicalSize = batch.size;
if (inputLogicalSize == 0) {
if (isLogDebugEnabled) {
LOG.debug(CLASS_NAME + " batch #" + batchCounter + " empty");
}
return;
}
// Perform any key expressions. Results will go into scratch columns.
if (bigTableKeyExpressions != null) {
for (VectorExpression ve : bigTableKeyExpressions) {
ve.evaluate(batch);
}
}
/*
* Single-Column String specific declarations.
*/
// The one join column for this specialized class.
BytesColumnVector joinColVector = (BytesColumnVector) batch.cols[singleJoinColumn];
byte[][] vector = joinColVector.vector;
int[] start = joinColVector.start;
int[] length = joinColVector.length;
/*
* Single-Column Long check for repeating.
*/
// Check single column for repeating.
boolean allKeyInputColumnsRepeating = joinColVector.isRepeating;
if (allKeyInputColumnsRepeating) {
/*
* Repeating.
*/
// All key input columns are repeating. Generate key once. Lookup once.
// Since the key is repeated, we must use entry 0 regardless of selectedInUse.
/*
* Single-Column String specific repeated lookup.
*/
JoinUtil.JoinResult joinResult;
if (!joinColVector.noNulls && joinColVector.isNull[0]) {
joinResult = JoinUtil.JoinResult.NOMATCH;
} else {
byte[] keyBytes = vector[0];
int keyStart = start[0];
int keyLength = length[0];
joinResult = hashSet.contains(keyBytes, keyStart, keyLength, hashSetResults[0]);
}
if (isLogDebugEnabled) {
LOG.debug(CLASS_NAME + " batch #" + batchCounter + " repeated joinResult " + joinResult.name());
}
finishLeftSemiRepeated(batch, joinResult, hashSetResults[0]);
} else {
if (isLogDebugEnabled) {
LOG.debug(CLASS_NAME + " batch #" + batchCounter + " non-repeated");
}
// We remember any matching rows in matchs / matchSize. At the end of the loop,
// selected / batch.size will represent both matching and non-matching rows for outer join.
// Only deferred rows will have been removed from selected.
int[] selected = batch.selected;
boolean selectedInUse = batch.selectedInUse;
int hashSetResultCount = 0;
int allMatchCount = 0;
int spillCount = 0;
/*
* Single-Column String specific variables.
*/
int saveKeyBatchIndex = -1;
// We optimize performance by only looking up the first key in a series of equal keys.
boolean haveSaveKey = false;
JoinUtil.JoinResult saveJoinResult = JoinUtil.JoinResult.NOMATCH;
// Logical loop over the rows in the batch since the batch may have selected in use.
for (int logical = 0; logical < inputLogicalSize; logical++) {
int batchIndex = (selectedInUse ? selected[logical] : logical);
/*
* Single-Column String get key.
*/
// Implicit -- use batchIndex.
boolean isNull = !joinColVector.noNulls && joinColVector.isNull[batchIndex];
if (isNull || !haveSaveKey || StringExpr.equal(vector[saveKeyBatchIndex], start[saveKeyBatchIndex], length[saveKeyBatchIndex], vector[batchIndex], start[batchIndex], length[batchIndex]) == false) {
if (haveSaveKey) {
// Move on with our counts.
switch(saveJoinResult) {
case MATCH:
// We have extracted the existence from the hash set result, so we don't keep it.
break;
case SPILL:
// We keep the hash set result for its spill information.
hashSetResultCount++;
break;
case NOMATCH:
break;
}
}
if (isNull) {
saveJoinResult = JoinUtil.JoinResult.NOMATCH;
haveSaveKey = false;
} else {
// Regardless of our matching result, we keep that information to make multiple use
// of it for a possible series of equal keys.
haveSaveKey = true;
/*
* Single-Column String specific save key and lookup.
*/
saveKeyBatchIndex = batchIndex;
/*
* Single-Column String specific lookup key.
*/
byte[] keyBytes = vector[batchIndex];
int keyStart = start[batchIndex];
int keyLength = length[batchIndex];
saveJoinResult = hashSet.contains(keyBytes, keyStart, keyLength, hashSetResults[hashSetResultCount]);
}
switch(saveJoinResult) {
case MATCH:
allMatchs[allMatchCount++] = batchIndex;
// VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " MATCH isSingleValue " + equalKeySeriesIsSingleValue[equalKeySeriesCount] + " currentKey " + currentKey);
break;
case SPILL:
spills[spillCount] = batchIndex;
spillHashMapResultIndices[spillCount] = hashSetResultCount;
spillCount++;
break;
case NOMATCH:
// VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " NOMATCH" + " currentKey " + currentKey);
break;
}
} else {
switch(saveJoinResult) {
case MATCH:
allMatchs[allMatchCount++] = batchIndex;
// VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " MATCH duplicate");
break;
case SPILL:
spills[spillCount] = batchIndex;
spillHashMapResultIndices[spillCount] = hashSetResultCount;
spillCount++;
break;
case NOMATCH:
// VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " NOMATCH duplicate");
break;
}
}
}
if (haveSaveKey) {
// Update our counts for the last key.
switch(saveJoinResult) {
case MATCH:
// We have extracted the existence from the hash set result, so we don't keep it.
break;
case SPILL:
// We keep the hash set result for its spill information.
hashSetResultCount++;
break;
case NOMATCH:
break;
}
}
if (isLogDebugEnabled) {
LOG.debug(CLASS_NAME + " allMatchs " + intArrayToRangesString(allMatchs, allMatchCount) + " spills " + intArrayToRangesString(spills, spillCount) + " spillHashMapResultIndices " + intArrayToRangesString(spillHashMapResultIndices, spillCount) + " hashMapResults " + Arrays.toString(Arrays.copyOfRange(hashSetResults, 0, hashSetResultCount)));
}
finishLeftSemi(batch, allMatchCount, spillCount, (VectorMapJoinHashTableResult[]) hashSetResults);
}
if (batch.size > 0) {
// Forward any remaining selected rows.
forwardBigTableBatch(batch);
}
} catch (IOException e) {
throw new HiveException(e);
} catch (Exception e) {
throw new HiveException(e);
}
}
use of org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashTableResult in project hive by apache.
the class VectorMapJoinOuterGenerateResultOperator method finishOuter.
/**
* Generate the outer join output results for one vectorized row batch.
*
* @param batch
* The big table batch with any matching and any non matching rows both as
* selected in use.
* @param allMatchCount
* Number of matches in allMatchs.
* @param equalKeySeriesCount
* Number of single value matches.
* @param atLeastOneNonMatch
* Whether at least one row was a non-match.
* @param inputSelectedInUse
* A copy of the batch's selectedInUse flag on input to the process method.
* @param inputLogicalSize
* The batch's size on input to the process method.
* @param spillCount
* Number of spills in spills.
* @param hashMapResultCount
* Number of entries in hashMapResults.
*/
public void finishOuter(VectorizedRowBatch batch, int allMatchCount, int equalKeySeriesCount, boolean atLeastOneNonMatch, boolean inputSelectedInUse, int inputLogicalSize, int spillCount, int hashMapResultCount) throws IOException, HiveException {
// Get rid of spills before we start modifying the batch.
if (spillCount > 0) {
spillHashMapBatch(batch, (VectorMapJoinHashTableResult[]) hashMapResults, spills, spillHashMapResultIndices, spillCount);
}
int noMatchCount = 0;
if (spillCount > 0) {
// Subtract the spills to get all match and non-match rows.
int nonSpillCount = subtractFromInputSelected(inputSelectedInUse, inputLogicalSize, spills, spillCount, nonSpills);
if (isLogDebugEnabled) {
LOG.debug("finishOuter spillCount > 0" + " nonSpills " + intArrayToRangesString(nonSpills, nonSpillCount));
}
// Big table value expressions apply to ALL matching and non-matching rows.
if (bigTableValueExpressions != null) {
doValueExpr(batch, nonSpills, nonSpillCount);
}
if (atLeastOneNonMatch) {
noMatchCount = subtract(nonSpills, nonSpillCount, allMatchs, allMatchCount, noMatchs);
if (isLogDebugEnabled) {
LOG.debug("finishOuter spillCount > 0" + " noMatchs " + intArrayToRangesString(noMatchs, noMatchCount));
}
}
} else {
// Run value expressions over original (whole) input batch.
doValueExprOnInputSelected(batch, inputSelectedInUse, inputLogicalSize);
if (atLeastOneNonMatch) {
noMatchCount = subtractFromInputSelected(inputSelectedInUse, inputLogicalSize, allMatchs, allMatchCount, noMatchs);
if (isLogDebugEnabled) {
LOG.debug("finishOuter spillCount == 0" + " noMatchs " + intArrayToRangesString(noMatchs, noMatchCount));
}
}
}
// overflow batch.
if (allMatchCount > 0) {
int numSel = 0;
for (int i = 0; i < equalKeySeriesCount; i++) {
int hashMapResultIndex = equalKeySeriesHashMapResultIndices[i];
VectorMapJoinHashMapResult hashMapResult = hashMapResults[hashMapResultIndex];
int allMatchesIndex = equalKeySeriesAllMatchIndices[i];
boolean isSingleValue = equalKeySeriesIsSingleValue[i];
int duplicateCount = equalKeySeriesDuplicateCounts[i];
if (isSingleValue) {
numSel = generateHashMapResultSingleValue(batch, hashMapResult, allMatchs, allMatchesIndex, duplicateCount, numSel);
} else {
generateHashMapResultMultiValue(batch, hashMapResult, allMatchs, allMatchesIndex, duplicateCount);
}
}
// The number of single value rows that were generated in the big table batch.
batch.size = numSel;
batch.selectedInUse = true;
if (isLogDebugEnabled) {
LOG.debug("finishOuter allMatchCount > 0" + " batch.selected " + intArrayToRangesString(batch.selected, batch.size));
}
} else {
batch.size = 0;
}
if (noMatchCount > 0) {
if (batch.size > 0) {
generateOuterNulls(batch, noMatchs, noMatchCount);
// Merge noMatchs and (match) selected.
int mergeCount = sortMerge(noMatchs, noMatchCount, batch.selected, batch.size, merged);
if (isLogDebugEnabled) {
LOG.debug("finishOuter noMatchCount > 0 && batch.size > 0" + " merged " + intArrayToRangesString(merged, mergeCount));
}
System.arraycopy(merged, 0, batch.selected, 0, mergeCount);
batch.size = mergeCount;
batch.selectedInUse = true;
} else {
// We can use the whole batch for output of no matches.
generateOuterNullsRepeatedAll(batch);
System.arraycopy(noMatchs, 0, batch.selected, 0, noMatchCount);
batch.size = noMatchCount;
batch.selectedInUse = true;
if (isLogDebugEnabled) {
LOG.debug("finishOuter noMatchCount > 0 && batch.size == 0" + " batch.selected " + intArrayToRangesString(batch.selected, batch.size));
}
}
}
}
use of org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashTableResult in project hive by apache.
the class VectorMapJoinInnerBigOnlyMultiKeyOperator method process.
//---------------------------------------------------------------------------
// Process Multi-Key Inner Big-Only Join on a vectorized row batch.
//
@Override
public void process(Object row, int tag) throws HiveException {
try {
VectorizedRowBatch batch = (VectorizedRowBatch) row;
alias = (byte) tag;
if (needCommonSetup) {
// Our one time process method initialization.
commonSetup(batch);
/*
* Initialize Multi-Key members for this specialized class.
*/
keyVectorSerializeWrite = new VectorSerializeRow(new BinarySortableSerializeWrite(bigTableKeyColumnMap.length));
keyVectorSerializeWrite.init(bigTableKeyTypeInfos, bigTableKeyColumnMap);
currentKeyOutput = new Output();
saveKeyOutput = new Output();
needCommonSetup = false;
}
if (needHashTableSetup) {
// Setup our hash table specialization. It will be the first time the process
// method is called, or after a Hybrid Grace reload.
/*
* Get our Multi-Key hash multi-set information for this specialized class.
*/
hashMultiSet = (VectorMapJoinBytesHashMultiSet) vectorMapJoinHashTable;
needHashTableSetup = false;
}
batchCounter++;
// For inner joins, we may apply the filter(s) now.
for (VectorExpression ve : bigTableFilterExpressions) {
ve.evaluate(batch);
}
final int inputLogicalSize = batch.size;
if (inputLogicalSize == 0) {
if (isLogDebugEnabled) {
LOG.debug(CLASS_NAME + " batch #" + batchCounter + " empty");
}
return;
}
// Perform any key expressions. Results will go into scratch columns.
if (bigTableKeyExpressions != null) {
for (VectorExpression ve : bigTableKeyExpressions) {
ve.evaluate(batch);
}
}
/*
* Multi-Key specific declarations.
*/
// None.
/*
* Multi-Key check for repeating.
*/
// If all BigTable input columns to key expressions are isRepeating, then
// calculate key once; lookup once.
boolean allKeyInputColumnsRepeating;
if (bigTableKeyColumnMap.length == 0) {
allKeyInputColumnsRepeating = false;
} else {
allKeyInputColumnsRepeating = true;
for (int i = 0; i < bigTableKeyColumnMap.length; i++) {
if (!batch.cols[bigTableKeyColumnMap[i]].isRepeating) {
allKeyInputColumnsRepeating = false;
break;
}
}
}
if (allKeyInputColumnsRepeating) {
/*
* Repeating.
*/
// All key input columns are repeating. Generate key once. Lookup once.
// Since the key is repeated, we must use entry 0 regardless of selectedInUse.
/*
* Multi-Key specific repeated lookup.
*/
keyVectorSerializeWrite.setOutput(currentKeyOutput);
keyVectorSerializeWrite.serializeWrite(batch, 0);
JoinUtil.JoinResult joinResult;
if (keyVectorSerializeWrite.getHasAnyNulls()) {
joinResult = JoinUtil.JoinResult.NOMATCH;
} else {
byte[] keyBytes = currentKeyOutput.getData();
int keyLength = currentKeyOutput.getLength();
joinResult = hashMultiSet.contains(keyBytes, 0, keyLength, hashMultiSetResults[0]);
}
if (isLogDebugEnabled) {
LOG.debug(CLASS_NAME + " batch #" + batchCounter + " repeated joinResult " + joinResult.name());
}
finishInnerBigOnlyRepeated(batch, joinResult, hashMultiSetResults[0]);
} else {
if (isLogDebugEnabled) {
LOG.debug(CLASS_NAME + " batch #" + batchCounter + " non-repeated");
}
// We remember any matching rows in matchs / matchSize. At the end of the loop,
// selected / batch.size will represent both matching and non-matching rows for outer join.
// Only deferred rows will have been removed from selected.
int[] selected = batch.selected;
boolean selectedInUse = batch.selectedInUse;
int hashMultiSetResultCount = 0;
int allMatchCount = 0;
int equalKeySeriesCount = 0;
int spillCount = 0;
/*
* Multi-Key specific variables.
*/
Output temp;
// We optimize performance by only looking up the first key in a series of equal keys.
boolean haveSaveKey = false;
JoinUtil.JoinResult saveJoinResult = JoinUtil.JoinResult.NOMATCH;
// Logical loop over the rows in the batch since the batch may have selected in use.
for (int logical = 0; logical < inputLogicalSize; logical++) {
int batchIndex = (selectedInUse ? selected[logical] : logical);
/*
* Multi-Key get key.
*/
// Generate binary sortable key for current row in vectorized row batch.
keyVectorSerializeWrite.setOutput(currentKeyOutput);
keyVectorSerializeWrite.serializeWrite(batch, batchIndex);
boolean isAnyNulls = keyVectorSerializeWrite.getHasAnyNulls();
if (isAnyNulls || !haveSaveKey || !saveKeyOutput.arraysEquals(currentKeyOutput)) {
if (haveSaveKey) {
// Move on with our counts.
switch(saveJoinResult) {
case MATCH:
// We have extracted the count from the hash multi-set result, so we don't keep it.
equalKeySeriesCount++;
break;
case SPILL:
// We keep the hash multi-set result for its spill information.
hashMultiSetResultCount++;
break;
case NOMATCH:
break;
}
}
if (isAnyNulls) {
saveJoinResult = JoinUtil.JoinResult.NOMATCH;
haveSaveKey = false;
} else {
// Regardless of our matching result, we keep that information to make multiple use
// of it for a possible series of equal keys.
haveSaveKey = true;
/*
* Multi-Key specific save key.
*/
temp = saveKeyOutput;
saveKeyOutput = currentKeyOutput;
currentKeyOutput = temp;
/*
* Single-Column Long specific lookup key.
*/
byte[] keyBytes = saveKeyOutput.getData();
int keyLength = saveKeyOutput.getLength();
saveJoinResult = hashMultiSet.contains(keyBytes, 0, keyLength, hashMultiSetResults[hashMultiSetResultCount]);
}
switch(saveJoinResult) {
case MATCH:
equalKeySeriesValueCounts[equalKeySeriesCount] = hashMultiSetResults[hashMultiSetResultCount].count();
equalKeySeriesAllMatchIndices[equalKeySeriesCount] = allMatchCount;
equalKeySeriesDuplicateCounts[equalKeySeriesCount] = 1;
allMatchs[allMatchCount++] = batchIndex;
// VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " MATCH isSingleValue " + equalKeySeriesIsSingleValue[equalKeySeriesCount] + " currentKey " + currentKey);
break;
case SPILL:
spills[spillCount] = batchIndex;
spillHashMapResultIndices[spillCount] = hashMultiSetResultCount;
spillCount++;
break;
case NOMATCH:
// VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " NOMATCH" + " currentKey " + currentKey);
break;
}
} else {
switch(saveJoinResult) {
case MATCH:
equalKeySeriesDuplicateCounts[equalKeySeriesCount]++;
allMatchs[allMatchCount++] = batchIndex;
// VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " MATCH duplicate");
break;
case SPILL:
spills[spillCount] = batchIndex;
spillHashMapResultIndices[spillCount] = hashMultiSetResultCount;
spillCount++;
break;
case NOMATCH:
// VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " NOMATCH duplicate");
break;
}
}
}
if (haveSaveKey) {
// Update our counts for the last key.
switch(saveJoinResult) {
case MATCH:
// We have extracted the count from the hash multi-set result, so we don't keep it.
equalKeySeriesCount++;
break;
case SPILL:
// We keep the hash multi-set result for its spill information.
hashMultiSetResultCount++;
break;
case NOMATCH:
break;
}
}
if (isLogDebugEnabled) {
LOG.debug(CLASS_NAME + " allMatchs " + intArrayToRangesString(allMatchs, allMatchCount) + " equalKeySeriesValueCounts " + longArrayToRangesString(equalKeySeriesValueCounts, equalKeySeriesCount) + " equalKeySeriesAllMatchIndices " + intArrayToRangesString(equalKeySeriesAllMatchIndices, equalKeySeriesCount) + " equalKeySeriesDuplicateCounts " + intArrayToRangesString(equalKeySeriesDuplicateCounts, equalKeySeriesCount) + " spills " + intArrayToRangesString(spills, spillCount) + " spillHashMapResultIndices " + intArrayToRangesString(spillHashMapResultIndices, spillCount) + " hashMapResults " + Arrays.toString(Arrays.copyOfRange(hashMultiSetResults, 0, hashMultiSetResultCount)));
}
finishInnerBigOnly(batch, allMatchCount, equalKeySeriesCount, spillCount, (VectorMapJoinHashTableResult[]) hashMultiSetResults, hashMultiSetResultCount);
}
if (batch.size > 0) {
// Forward any remaining selected rows.
forwardBigTableBatch(batch);
}
} catch (IOException e) {
throw new HiveException(e);
} catch (Exception e) {
throw new HiveException(e);
}
}
use of org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashTableResult in project hive by apache.
the class VectorMapJoinInnerBigOnlyLongOperator method process.
//---------------------------------------------------------------------------
// Process Single-Column Long Inner Big-Only Join on a vectorized row batch.
//
@Override
public void process(Object row, int tag) throws HiveException {
try {
VectorizedRowBatch batch = (VectorizedRowBatch) row;
alias = (byte) tag;
if (needCommonSetup) {
// Our one time process method initialization.
commonSetup(batch);
/*
* Initialize Single-Column Long members for this specialized class.
*/
singleJoinColumn = bigTableKeyColumnMap[0];
needCommonSetup = false;
}
if (needHashTableSetup) {
// Setup our hash table specialization. It will be the first time the process
// method is called, or after a Hybrid Grace reload.
/*
* Get our Single-Column Long hash multi-set information for this specialized class.
*/
hashMultiSet = (VectorMapJoinLongHashMultiSet) vectorMapJoinHashTable;
useMinMax = hashMultiSet.useMinMax();
if (useMinMax) {
min = hashMultiSet.min();
max = hashMultiSet.max();
}
needHashTableSetup = false;
}
batchCounter++;
// For inner joins, we may apply the filter(s) now.
for (VectorExpression ve : bigTableFilterExpressions) {
ve.evaluate(batch);
}
final int inputLogicalSize = batch.size;
if (inputLogicalSize == 0) {
if (isLogDebugEnabled) {
LOG.debug(CLASS_NAME + " batch #" + batchCounter + " empty");
}
return;
}
// Perform any key expressions. Results will go into scratch columns.
if (bigTableKeyExpressions != null) {
for (VectorExpression ve : bigTableKeyExpressions) {
ve.evaluate(batch);
}
}
/*
* Single-Column Long specific declarations.
*/
// The one join column for this specialized class.
LongColumnVector joinColVector = (LongColumnVector) batch.cols[singleJoinColumn];
long[] vector = joinColVector.vector;
/*
* Single-Column Long check for repeating.
*/
// Check single column for repeating.
boolean allKeyInputColumnsRepeating = joinColVector.isRepeating;
if (allKeyInputColumnsRepeating) {
/*
* Repeating.
*/
// All key input columns are repeating. Generate key once. Lookup once.
// Since the key is repeated, we must use entry 0 regardless of selectedInUse.
/*
* Single-Column Long specific repeated lookup.
*/
JoinUtil.JoinResult joinResult;
if (!joinColVector.noNulls && joinColVector.isNull[0]) {
joinResult = JoinUtil.JoinResult.NOMATCH;
} else {
long key = vector[0];
if (useMinMax && (key < min || key > max)) {
// Out of range for whole batch.
joinResult = JoinUtil.JoinResult.NOMATCH;
} else {
joinResult = hashMultiSet.contains(key, hashMultiSetResults[0]);
}
}
if (isLogDebugEnabled) {
LOG.debug(CLASS_NAME + " batch #" + batchCounter + " repeated joinResult " + joinResult.name());
}
finishInnerBigOnlyRepeated(batch, joinResult, hashMultiSetResults[0]);
} else {
if (isLogDebugEnabled) {
LOG.debug(CLASS_NAME + " batch #" + batchCounter + " non-repeated");
}
// We remember any matching rows in matchs / matchSize. At the end of the loop,
// selected / batch.size will represent both matching and non-matching rows for outer join.
// Only deferred rows will have been removed from selected.
int[] selected = batch.selected;
boolean selectedInUse = batch.selectedInUse;
int hashMultiSetResultCount = 0;
int allMatchCount = 0;
int equalKeySeriesCount = 0;
int spillCount = 0;
/*
* Single-Column Long specific variables.
*/
long saveKey = 0;
// We optimize performance by only looking up the first key in a series of equal keys.
boolean haveSaveKey = false;
JoinUtil.JoinResult saveJoinResult = JoinUtil.JoinResult.NOMATCH;
// Logical loop over the rows in the batch since the batch may have selected in use.
for (int logical = 0; logical < inputLogicalSize; logical++) {
int batchIndex = (selectedInUse ? selected[logical] : logical);
/*
* Single-Column Long get key.
*/
long currentKey;
boolean isNull;
if (!joinColVector.noNulls && joinColVector.isNull[batchIndex]) {
currentKey = 0;
isNull = true;
} else {
currentKey = vector[batchIndex];
isNull = false;
}
if (isNull || !haveSaveKey || currentKey != saveKey) {
if (haveSaveKey) {
// Move on with our counts.
switch(saveJoinResult) {
case MATCH:
// We have extracted the count from the hash multi-set result, so we don't keep it.
equalKeySeriesCount++;
break;
case SPILL:
// We keep the hash multi-set result for its spill information.
hashMultiSetResultCount++;
break;
case NOMATCH:
break;
}
}
if (isNull) {
saveJoinResult = JoinUtil.JoinResult.NOMATCH;
haveSaveKey = false;
} else {
// Regardless of our matching result, we keep that information to make multiple use
// of it for a possible series of equal keys.
haveSaveKey = true;
/*
* Single-Column Long specific save key.
*/
saveKey = currentKey;
if (useMinMax && (currentKey < min || currentKey > max)) {
// Key out of range for whole hash table.
saveJoinResult = JoinUtil.JoinResult.NOMATCH;
} else {
saveJoinResult = hashMultiSet.contains(currentKey, hashMultiSetResults[hashMultiSetResultCount]);
}
}
switch(saveJoinResult) {
case MATCH:
equalKeySeriesValueCounts[equalKeySeriesCount] = hashMultiSetResults[hashMultiSetResultCount].count();
equalKeySeriesAllMatchIndices[equalKeySeriesCount] = allMatchCount;
equalKeySeriesDuplicateCounts[equalKeySeriesCount] = 1;
allMatchs[allMatchCount++] = batchIndex;
// VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " MATCH isSingleValue " + equalKeySeriesIsSingleValue[equalKeySeriesCount] + " currentKey " + currentKey);
break;
case SPILL:
spills[spillCount] = batchIndex;
spillHashMapResultIndices[spillCount] = hashMultiSetResultCount;
spillCount++;
break;
case NOMATCH:
// VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " NOMATCH" + " currentKey " + currentKey);
break;
}
} else {
switch(saveJoinResult) {
case MATCH:
equalKeySeriesDuplicateCounts[equalKeySeriesCount]++;
allMatchs[allMatchCount++] = batchIndex;
// VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " MATCH duplicate");
break;
case SPILL:
spills[spillCount] = batchIndex;
spillHashMapResultIndices[spillCount] = hashMultiSetResultCount;
spillCount++;
break;
case NOMATCH:
// VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " NOMATCH duplicate");
break;
}
}
}
if (haveSaveKey) {
// Update our counts for the last key.
switch(saveJoinResult) {
case MATCH:
// We have extracted the count from the hash multi-set result, so we don't keep it.
equalKeySeriesCount++;
break;
case SPILL:
// We keep the hash multi-set result for its spill information.
hashMultiSetResultCount++;
break;
case NOMATCH:
break;
}
}
if (isLogDebugEnabled) {
LOG.debug(CLASS_NAME + " allMatchs " + intArrayToRangesString(allMatchs, allMatchCount) + " equalKeySeriesValueCounts " + longArrayToRangesString(equalKeySeriesValueCounts, equalKeySeriesCount) + " equalKeySeriesAllMatchIndices " + intArrayToRangesString(equalKeySeriesAllMatchIndices, equalKeySeriesCount) + " equalKeySeriesDuplicateCounts " + intArrayToRangesString(equalKeySeriesDuplicateCounts, equalKeySeriesCount) + " spills " + intArrayToRangesString(spills, spillCount) + " spillHashMapResultIndices " + intArrayToRangesString(spillHashMapResultIndices, spillCount) + " hashMapResults " + Arrays.toString(Arrays.copyOfRange(hashMultiSetResults, 0, hashMultiSetResultCount)));
}
finishInnerBigOnly(batch, allMatchCount, equalKeySeriesCount, spillCount, (VectorMapJoinHashTableResult[]) hashMultiSetResults, hashMultiSetResultCount);
}
if (batch.size > 0) {
// Forward any remaining selected rows.
forwardBigTableBatch(batch);
}
} catch (IOException e) {
throw new HiveException(e);
} catch (Exception e) {
throw new HiveException(e);
}
}
Aggregations