use of org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression in project hive by apache.
the class VectorMapJoinInnerBigOnlyStringOperator method process.
// ---------------------------------------------------------------------------
// Process Single-Column String Inner Big-Only Join on a vectorized row batch.
//
/**
 * Processes one big-table row batch for the single-column string inner big-only join.
 *
 * <p>In order: performs the one-time setup, evaluates the big-table filter and key
 * expressions on the batch, probes the hash multi-set once per run of equal adjacent keys,
 * hands the collected match/spill bookkeeping to {@code finishInnerBigOnlyRepeated} or
 * {@code finishInnerBigOnly}, and finally forwards the batch if it still has rows.
 *
 * @param row the batch to process; it is cast to {@code VectorizedRowBatch}
 * @param tag the input tag; it is narrowed to a byte and stored in {@code alias}
 * @throws HiveException wrapping any exception raised while processing the batch
 */
@Override
public void process(Object row, int tag) throws HiveException {
  try {
    VectorizedRowBatch batch = (VectorizedRowBatch) row;

    alias = (byte) tag;

    if (needCommonSetup) {
      // Our one time process method initialization.
      commonSetup(batch);

      /*
       * Initialize Single-Column String members for this specialized class.
       */
      singleJoinColumn = bigTableKeyColumnMap[0];

      needCommonSetup = false;
    }

    if (needHashTableSetup) {
      // Setup our hash table specialization. It will be the first time the process
      // method is called, or after a Hybrid Grace reload.

      /*
       * Get our Single-Column String hash multi-set information for this specialized class.
       */
      hashMultiSet = (VectorMapJoinBytesHashMultiSet) vectorMapJoinHashTable;

      needHashTableSetup = false;
    }

    batchCounter++;

    // For inner joins, we may apply the filter(s) now.
    for (VectorExpression ve : bigTableFilterExpressions) {
      ve.evaluate(batch);
    }

    final int inputLogicalSize = batch.size;
    if (inputLogicalSize == 0) {
      if (LOG.isDebugEnabled()) {
        LOG.debug(CLASS_NAME + " batch #" + batchCounter + " empty");
      }
      return;
    }

    // Perform any key expressions. Results will go into scratch columns.
    if (bigTableKeyExpressions != null) {
      for (VectorExpression ve : bigTableKeyExpressions) {
        ve.evaluate(batch);
      }
    }

    /*
     * Single-Column String specific declarations.
     */

    // The one join column for this specialized class.
    BytesColumnVector joinColVector = (BytesColumnVector) batch.cols[singleJoinColumn];
    byte[][] vector = joinColVector.vector;
    int[] start = joinColVector.start;
    int[] length = joinColVector.length;

    /*
     * Single-Column String check for repeating.
     */
    boolean allKeyInputColumnsRepeating = joinColVector.isRepeating;

    if (allKeyInputColumnsRepeating) {

      /*
       * Repeating.
       */

      // The key column is repeating. Generate key once. Lookup once.
      // Since the key is repeated, we must use entry 0 regardless of selectedInUse.
      JoinUtil.JoinResult joinResult;
      if (!joinColVector.noNulls && joinColVector.isNull[0]) {
        // A null key is treated as no match without probing the hash multi-set.
        joinResult = JoinUtil.JoinResult.NOMATCH;
      } else {
        byte[] keyBytes = vector[0];
        int keyStart = start[0];
        int keyLength = length[0];
        joinResult = hashMultiSet.contains(keyBytes, keyStart, keyLength, hashMultiSetResults[0]);
      }

      if (LOG.isDebugEnabled()) {
        LOG.debug(CLASS_NAME + " batch #" + batchCounter + " repeated joinResult " + joinResult.name());
      }
      finishInnerBigOnlyRepeated(batch, joinResult, hashMultiSetResults[0]);
    } else {

      /*
       * NOT Repeating.
       */

      if (LOG.isDebugEnabled()) {
        LOG.debug(CLASS_NAME + " batch #" + batchCounter + " non-repeated");
      }

      // Matching rows are recorded in allMatchs (grouped into equal-key series) and
      // rows whose lookup returned SPILL are recorded in spills. Rows with no match are
      // not recorded anywhere by this loop.
      int[] selected = batch.selected;
      boolean selectedInUse = batch.selectedInUse;

      int hashMultiSetResultCount = 0;
      int allMatchCount = 0;
      int equalKeySeriesCount = 0;
      int spillCount = 0;

      /*
       * Single-Column String specific variables.
       */

      // Batch index of the first row of the current equal-key series; -1 until one exists.
      int saveKeyBatchIndex = -1;

      // We optimize performance by only looking up the first key in a series of equal keys.
      boolean haveSaveKey = false;
      JoinUtil.JoinResult saveJoinResult = JoinUtil.JoinResult.NOMATCH;

      // Logical loop over the rows in the batch since the batch may have selected in use.
      for (int logical = 0; logical < inputLogicalSize; logical++) {
        int batchIndex = (selectedInUse ? selected[logical] : logical);

        /*
         * Single-Column String get key.
         */

        // Implicit -- use batchIndex.
        boolean isNull = !joinColVector.noNulls && joinColVector.isNull[batchIndex];

        // The byte comparison is only reached when a saved key exists (short-circuit),
        // so saveKeyBatchIndex is never -1 when it is used as an index.
        if (isNull || !haveSaveKey
            || !StringExpr.equal(
                vector[saveKeyBatchIndex], start[saveKeyBatchIndex], length[saveKeyBatchIndex],
                vector[batchIndex], start[batchIndex], length[batchIndex])) {

          // New key (or a null): close out the previous series first.
          if (haveSaveKey) {
            // Move on with our counts.
            switch (saveJoinResult) {
              case MATCH:
                // We have extracted the count from the hash multi-set result, so we don't keep it.
                equalKeySeriesCount++;
                break;
              case SPILL:
                // We keep the hash multi-set result for its spill information.
                hashMultiSetResultCount++;
                break;
              case NOMATCH:
                break;
            }
          }

          if (isNull) {
            saveJoinResult = JoinUtil.JoinResult.NOMATCH;
            haveSaveKey = false;
          } else {
            // Regardless of our matching result, we keep that information to make multiple use
            // of it for a possible series of equal keys.
            haveSaveKey = true;

            /*
             * Single-Column String specific save key.
             */
            saveKeyBatchIndex = batchIndex;

            /*
             * Single-Column String specific lookup key.
             */
            byte[] keyBytes = vector[batchIndex];
            int keyStart = start[batchIndex];
            int keyLength = length[batchIndex];
            saveJoinResult = hashMultiSet.contains(
                keyBytes, keyStart, keyLength, hashMultiSetResults[hashMultiSetResultCount]);
          }

          // Record the first row of the (possibly new) series.
          switch (saveJoinResult) {
            case MATCH:
              equalKeySeriesValueCounts[equalKeySeriesCount] =
                  hashMultiSetResults[hashMultiSetResultCount].count();
              equalKeySeriesAllMatchIndices[equalKeySeriesCount] = allMatchCount;
              equalKeySeriesDuplicateCounts[equalKeySeriesCount] = 1;
              allMatchs[allMatchCount++] = batchIndex;
              break;
            case SPILL:
              spills[spillCount] = batchIndex;
              spillHashMapResultIndices[spillCount] = hashMultiSetResultCount;
              spillCount++;
              break;
            case NOMATCH:
              break;
          }
        } else {
          // Same key as the previous row: reuse the saved lookup result.
          switch (saveJoinResult) {
            case MATCH:
              equalKeySeriesDuplicateCounts[equalKeySeriesCount]++;
              allMatchs[allMatchCount++] = batchIndex;
              break;
            case SPILL:
              spills[spillCount] = batchIndex;
              spillHashMapResultIndices[spillCount] = hashMultiSetResultCount;
              spillCount++;
              break;
            case NOMATCH:
              break;
          }
        }
      }

      if (haveSaveKey) {
        // Update our counts for the last key.
        switch (saveJoinResult) {
          case MATCH:
            // We have extracted the count from the hash multi-set result, so we don't keep it.
            equalKeySeriesCount++;
            break;
          case SPILL:
            // We keep the hash multi-set result for its spill information.
            hashMultiSetResultCount++;
            break;
          case NOMATCH:
            break;
        }
      }

      if (LOG.isDebugEnabled()) {
        LOG.debug(CLASS_NAME
            + " allMatchs " + intArrayToRangesString(allMatchs, allMatchCount)
            + " equalKeySeriesValueCounts " + longArrayToRangesString(equalKeySeriesValueCounts, equalKeySeriesCount)
            + " equalKeySeriesAllMatchIndices " + intArrayToRangesString(equalKeySeriesAllMatchIndices, equalKeySeriesCount)
            + " equalKeySeriesDuplicateCounts " + intArrayToRangesString(equalKeySeriesDuplicateCounts, equalKeySeriesCount)
            + " spills " + intArrayToRangesString(spills, spillCount)
            + " spillHashMapResultIndices " + intArrayToRangesString(spillHashMapResultIndices, spillCount)
            + " hashMapResults " + Arrays.toString(Arrays.copyOfRange(hashMultiSetResults, 0, hashMultiSetResultCount)));
      }

      finishInnerBigOnly(batch, allMatchCount, equalKeySeriesCount, spillCount,
          (VectorMapJoinHashTableResult[]) hashMultiSetResults, hashMultiSetResultCount);
    }

    if (batch.size > 0) {
      // Forward any remaining selected rows.
      forwardBigTableBatch(batch);
    }
  } catch (Exception e) {
    // Every failure, checked or unchecked, is reported to the caller as a HiveException.
    throw new HiveException(e);
  }
}
use of org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression in project hive by apache.
the class VectorMapJoinInnerLongOperator method process.
// ---------------------------------------------------------------------------
// Process Single-Column Long Inner Join on a vectorized row batch.
//
/**
 * Processes one big-table row batch for the single-column long inner join.
 *
 * <p>After the one-time setup and the per-batch inner-join setup, the big-table filter and
 * key expressions are evaluated on the batch. The join key column is then probed against the
 * long hash map: once for a repeating column, otherwise once per run of equal adjacent keys.
 * Keys outside the hash map's [min, max] range (when {@code useMinMax} is set) and null keys
 * are treated as NOMATCH without a lookup. The bookkeeping is handed to
 * {@code finishInnerRepeated} or {@code finishInner}, and the batch is forwarded if it
 * still has rows afterwards.
 *
 * @param row the batch to process; it is cast to {@code VectorizedRowBatch}
 * @param tag the input tag; it is narrowed to a byte and stored in {@code alias}
 * @throws HiveException wrapping any exception raised while processing the batch
 */
@Override
public void process(Object row, int tag) throws HiveException {
  try {
    final VectorizedRowBatch batch = (VectorizedRowBatch) row;
    alias = (byte) tag;

    // One-time initialization of the members shared by all calls.
    if (needCommonSetup) {
      commonSetup(batch);
      singleJoinColumn = bigTableKeyColumnMap[0];
      needCommonSetup = false;
    }

    // (Re)bind the specialized hash map and cache its key range, when it exposes one.
    if (needHashTableSetup) {
      hashMap = (VectorMapJoinLongHashMap) vectorMapJoinHashTable;
      useMinMax = hashMap.useMinMax();
      if (useMinMax) {
        min = hashMap.min();
        max = hashMap.max();
      }
      needHashTableSetup = false;
    }

    batchCounter++;

    innerPerBatchSetup(batch);

    // Inner join: the big-table filters can be applied before any key lookup.
    for (VectorExpression filterExpr : bigTableFilterExpressions) {
      filterExpr.evaluate(batch);
    }

    final int inputLogicalSize = batch.size;
    if (inputLogicalSize == 0) {
      if (LOG.isDebugEnabled()) {
        LOG.debug(CLASS_NAME + " batch #" + batchCounter + " empty");
      }
      return;
    }

    // Key expressions are evaluated on the batch before the key column is read.
    if (bigTableKeyExpressions != null) {
      for (VectorExpression keyExpr : bigTableKeyExpressions) {
        keyExpr.evaluate(batch);
      }
    }

    final LongColumnVector keyColumn = (LongColumnVector) batch.cols[singleJoinColumn];
    final long[] keyValues = keyColumn.vector;

    if (keyColumn.isRepeating) {
      // Repeating key: entry 0 stands for every row, regardless of selectedInUse.
      final boolean repeatedKeyIsNull = !keyColumn.noNulls && keyColumn.isNull[0];

      JoinUtil.JoinResult joinResult = JoinUtil.JoinResult.NOMATCH;
      if (!repeatedKeyIsNull) {
        final long key = keyValues[0];
        final boolean outOfRange = useMinMax && (key < min || key > max);
        if (!outOfRange) {
          joinResult = hashMap.lookup(key, hashMapResults[0]);
        }
      }

      if (LOG.isDebugEnabled()) {
        LOG.debug(CLASS_NAME + " batch #" + batchCounter + " repeated joinResult " + joinResult.name());
      }
      finishInnerRepeated(batch, joinResult, hashMapResults[0]);
    } else {
      if (LOG.isDebugEnabled()) {
        LOG.debug(CLASS_NAME + " batch #" + batchCounter + " non-repeated");
      }

      final int[] selected = batch.selected;
      final boolean selectedInUse = batch.selectedInUse;

      int hashMapResultCount = 0;
      int allMatchCount = 0;
      int equalKeySeriesCount = 0;
      int spillCount = 0;

      // Only the first key of a run of equal keys is looked up; its result is reused.
      long saveKey = 0;
      boolean haveSaveKey = false;
      JoinUtil.JoinResult saveJoinResult = JoinUtil.JoinResult.NOMATCH;

      // Walk the logical rows; map through selected[] when the batch uses it.
      for (int logical = 0; logical < inputLogicalSize; logical++) {
        final int batchIndex = selectedInUse ? selected[logical] : logical;

        final boolean isNull = !keyColumn.noNulls && keyColumn.isNull[batchIndex];
        final long currentKey = isNull ? 0 : keyValues[batchIndex];

        final boolean startsNewSeries = isNull || !haveSaveKey || currentKey != saveKey;

        if (startsNewSeries) {
          // Close out the previous series before starting the next one.
          if (haveSaveKey) {
            switch (saveJoinResult) {
              case MATCH:
                hashMapResultCount++;
                equalKeySeriesCount++;
                break;
              case SPILL:
                hashMapResultCount++;
                break;
              case NOMATCH:
                break;
            }
          }

          if (isNull) {
            // A null key never matches and never becomes the saved key.
            saveJoinResult = JoinUtil.JoinResult.NOMATCH;
            haveSaveKey = false;
          } else {
            haveSaveKey = true;
            saveKey = currentKey;
            final boolean outOfRange = useMinMax && (currentKey < min || currentKey > max);
            saveJoinResult = outOfRange
                ? JoinUtil.JoinResult.NOMATCH
                : hashMap.lookup(currentKey, hashMapResults[hashMapResultCount]);
          }
        }

        // Record this row according to the (new or reused) lookup result.
        switch (saveJoinResult) {
          case MATCH:
            if (startsNewSeries) {
              equalKeySeriesHashMapResultIndices[equalKeySeriesCount] = hashMapResultCount;
              equalKeySeriesAllMatchIndices[equalKeySeriesCount] = allMatchCount;
              equalKeySeriesIsSingleValue[equalKeySeriesCount] =
                  hashMapResults[hashMapResultCount].isSingleRow();
              equalKeySeriesDuplicateCounts[equalKeySeriesCount] = 1;
            } else {
              equalKeySeriesDuplicateCounts[equalKeySeriesCount]++;
            }
            allMatchs[allMatchCount++] = batchIndex;
            break;
          case SPILL:
            spills[spillCount] = batchIndex;
            spillHashMapResultIndices[spillCount] = hashMapResultCount;
            spillCount++;
            break;
          case NOMATCH:
            break;
        }
      }

      // Account for the series that was still open when the loop ended.
      if (haveSaveKey) {
        switch (saveJoinResult) {
          case MATCH:
            hashMapResultCount++;
            equalKeySeriesCount++;
            break;
          case SPILL:
            hashMapResultCount++;
            break;
          case NOMATCH:
            break;
        }
      }

      if (LOG.isDebugEnabled()) {
        LOG.debug(CLASS_NAME
            + " allMatchs " + intArrayToRangesString(allMatchs, allMatchCount)
            + " equalKeySeriesHashMapResultIndices " + intArrayToRangesString(equalKeySeriesHashMapResultIndices, equalKeySeriesCount)
            + " equalKeySeriesAllMatchIndices " + intArrayToRangesString(equalKeySeriesAllMatchIndices, equalKeySeriesCount)
            + " equalKeySeriesIsSingleValue " + Arrays.toString(Arrays.copyOfRange(equalKeySeriesIsSingleValue, 0, equalKeySeriesCount))
            + " equalKeySeriesDuplicateCounts " + Arrays.toString(Arrays.copyOfRange(equalKeySeriesDuplicateCounts, 0, equalKeySeriesCount))
            + " spills " + intArrayToRangesString(spills, spillCount)
            + " spillHashMapResultIndices " + intArrayToRangesString(spillHashMapResultIndices, spillCount)
            + " hashMapResults " + Arrays.toString(Arrays.copyOfRange(hashMapResults, 0, hashMapResultCount)));
      }

      finishInner(batch, allMatchCount, equalKeySeriesCount, spillCount, hashMapResultCount);
    }

    // Whatever is still selected in the batch goes downstream.
    if (batch.size > 0) {
      forwardBigTableBatch(batch);
    }
  } catch (Exception e) {
    // Every failure, checked or unchecked, is reported to the caller as a HiveException.
    throw new HiveException(e);
  }
}
use of org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression in project hive by apache.
the class VectorMapJoinInnerStringOperator method process.
// ---------------------------------------------------------------------------
// Process Single-Column String Inner Join on a vectorized row batch.
//
/**
 * Processes one big-table row batch for the single-column string inner join.
 *
 * <p>After the one-time setup and the per-batch inner-join setup, the big-table filter and
 * key expressions are evaluated on the batch. The bytes key column is then probed against
 * the bytes hash map: once for a repeating column, otherwise once per run of equal adjacent
 * keys. Null keys are treated as NOMATCH without a lookup. The bookkeeping is handed to
 * {@code finishInnerRepeated} or {@code finishInner}, and the batch is forwarded if it
 * still has rows afterwards.
 *
 * @param row the batch to process; it is cast to {@code VectorizedRowBatch}
 * @param tag the input tag; it is narrowed to a byte and stored in {@code alias}
 * @throws HiveException wrapping any exception raised while processing the batch
 */
@Override
public void process(Object row, int tag) throws HiveException {
  try {
    final VectorizedRowBatch batch = (VectorizedRowBatch) row;
    alias = (byte) tag;

    // One-time initialization of the members shared by all calls.
    if (needCommonSetup) {
      commonSetup(batch);
      singleJoinColumn = bigTableKeyColumnMap[0];
      needCommonSetup = false;
    }

    // (Re)bind the specialized hash map.
    if (needHashTableSetup) {
      hashMap = (VectorMapJoinBytesHashMap) vectorMapJoinHashTable;
      needHashTableSetup = false;
    }

    batchCounter++;

    innerPerBatchSetup(batch);

    // Inner join: the big-table filters can be applied before any key lookup.
    for (VectorExpression filterExpr : bigTableFilterExpressions) {
      filterExpr.evaluate(batch);
    }

    final int inputLogicalSize = batch.size;
    if (inputLogicalSize == 0) {
      if (LOG.isDebugEnabled()) {
        LOG.debug(CLASS_NAME + " batch #" + batchCounter + " empty");
      }
      return;
    }

    // Key expressions are evaluated on the batch before the key column is read.
    if (bigTableKeyExpressions != null) {
      for (VectorExpression keyExpr : bigTableKeyExpressions) {
        keyExpr.evaluate(batch);
      }
    }

    final BytesColumnVector keyColumn = (BytesColumnVector) batch.cols[singleJoinColumn];
    final byte[][] bytesByRow = keyColumn.vector;
    final int[] startByRow = keyColumn.start;
    final int[] lengthByRow = keyColumn.length;

    if (keyColumn.isRepeating) {
      // Repeating key: entry 0 stands for every row, regardless of selectedInUse.
      final boolean repeatedKeyIsNull = !keyColumn.noNulls && keyColumn.isNull[0];

      JoinUtil.JoinResult joinResult = JoinUtil.JoinResult.NOMATCH;
      if (!repeatedKeyIsNull) {
        joinResult = hashMap.lookup(bytesByRow[0], startByRow[0], lengthByRow[0], hashMapResults[0]);
      }

      if (LOG.isDebugEnabled()) {
        LOG.debug(CLASS_NAME + " batch #" + batchCounter + " repeated joinResult " + joinResult.name());
      }
      finishInnerRepeated(batch, joinResult, hashMapResults[0]);
    } else {
      if (LOG.isDebugEnabled()) {
        LOG.debug(CLASS_NAME + " batch #" + batchCounter + " non-repeated");
      }

      final int[] selected = batch.selected;
      final boolean selectedInUse = batch.selectedInUse;

      int hashMapResultCount = 0;
      int allMatchCount = 0;
      int equalKeySeriesCount = 0;
      int spillCount = 0;

      // Only the first key of a run of equal keys is looked up; its result is reused.
      // The saved key is remembered by batch index rather than copied; -1 until one exists.
      int saveKeyBatchIndex = -1;
      boolean haveSaveKey = false;
      JoinUtil.JoinResult saveJoinResult = JoinUtil.JoinResult.NOMATCH;

      // Walk the logical rows; map through selected[] when the batch uses it.
      for (int logical = 0; logical < inputLogicalSize; logical++) {
        final int batchIndex = selectedInUse ? selected[logical] : logical;

        final boolean isNull = !keyColumn.noNulls && keyColumn.isNull[batchIndex];

        // Short-circuit order matters: the byte comparison indexes with saveKeyBatchIndex,
        // which is only valid once haveSaveKey is true.
        final boolean startsNewSeries = isNull
            || !haveSaveKey
            || !StringExpr.equal(
                bytesByRow[saveKeyBatchIndex], startByRow[saveKeyBatchIndex], lengthByRow[saveKeyBatchIndex],
                bytesByRow[batchIndex], startByRow[batchIndex], lengthByRow[batchIndex]);

        if (startsNewSeries) {
          // Close out the previous series before starting the next one.
          if (haveSaveKey) {
            switch (saveJoinResult) {
              case MATCH:
                hashMapResultCount++;
                equalKeySeriesCount++;
                break;
              case SPILL:
                hashMapResultCount++;
                break;
              case NOMATCH:
                break;
            }
          }

          if (isNull) {
            // A null key never matches and never becomes the saved key.
            saveJoinResult = JoinUtil.JoinResult.NOMATCH;
            haveSaveKey = false;
          } else {
            haveSaveKey = true;
            saveKeyBatchIndex = batchIndex;
            saveJoinResult = hashMap.lookup(
                bytesByRow[batchIndex], startByRow[batchIndex], lengthByRow[batchIndex],
                hashMapResults[hashMapResultCount]);
          }
        }

        // Record this row according to the (new or reused) lookup result.
        switch (saveJoinResult) {
          case MATCH:
            if (startsNewSeries) {
              equalKeySeriesHashMapResultIndices[equalKeySeriesCount] = hashMapResultCount;
              equalKeySeriesAllMatchIndices[equalKeySeriesCount] = allMatchCount;
              equalKeySeriesIsSingleValue[equalKeySeriesCount] =
                  hashMapResults[hashMapResultCount].isSingleRow();
              equalKeySeriesDuplicateCounts[equalKeySeriesCount] = 1;
            } else {
              equalKeySeriesDuplicateCounts[equalKeySeriesCount]++;
            }
            allMatchs[allMatchCount++] = batchIndex;
            break;
          case SPILL:
            spills[spillCount] = batchIndex;
            spillHashMapResultIndices[spillCount] = hashMapResultCount;
            spillCount++;
            break;
          case NOMATCH:
            break;
        }
      }

      // Account for the series that was still open when the loop ended.
      if (haveSaveKey) {
        switch (saveJoinResult) {
          case MATCH:
            hashMapResultCount++;
            equalKeySeriesCount++;
            break;
          case SPILL:
            hashMapResultCount++;
            break;
          case NOMATCH:
            break;
        }
      }

      if (LOG.isDebugEnabled()) {
        LOG.debug(CLASS_NAME
            + " allMatchs " + intArrayToRangesString(allMatchs, allMatchCount)
            + " equalKeySeriesHashMapResultIndices " + intArrayToRangesString(equalKeySeriesHashMapResultIndices, equalKeySeriesCount)
            + " equalKeySeriesAllMatchIndices " + intArrayToRangesString(equalKeySeriesAllMatchIndices, equalKeySeriesCount)
            + " equalKeySeriesIsSingleValue " + Arrays.toString(Arrays.copyOfRange(equalKeySeriesIsSingleValue, 0, equalKeySeriesCount))
            + " equalKeySeriesDuplicateCounts " + Arrays.toString(Arrays.copyOfRange(equalKeySeriesDuplicateCounts, 0, equalKeySeriesCount))
            + " spills " + intArrayToRangesString(spills, spillCount)
            + " spillHashMapResultIndices " + intArrayToRangesString(spillHashMapResultIndices, spillCount)
            + " hashMapResults " + Arrays.toString(Arrays.copyOfRange(hashMapResults, 0, hashMapResultCount)));
      }

      finishInner(batch, allMatchCount, equalKeySeriesCount, spillCount, hashMapResultCount);
    }

    // Whatever is still selected in the batch goes downstream.
    if (batch.size > 0) {
      forwardBigTableBatch(batch);
    }
  } catch (Exception e) {
    // Every failure, checked or unchecked, is reported to the caller as a HiveException.
    throw new HiveException(e);
  }
}
use of org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression in project hive by apache.
the class VectorMapJoinLeftSemiLongOperator method process.
// ---------------------------------------------------------------------------
// Process Single-Column Long Left-Semi Join on a vectorized row batch.
//
/**
 * Processes one big-table row batch for the single-column long left-semi join.
 *
 * <p>After the one-time setup, the big-table filter and key expressions are evaluated on
 * the batch. The join key column is then probed against the long hash set: once for a
 * repeating column, otherwise once per run of equal adjacent keys. Keys outside the hash
 * set's [min, max] range (when {@code useMinMax} is set) and null keys are treated as
 * NOMATCH without a lookup. The bookkeeping is handed to {@code finishLeftSemiRepeated} or
 * {@code finishLeftSemi}, and the batch is forwarded if it still has rows afterwards.
 *
 * @param row the batch to process; it is cast to {@code VectorizedRowBatch}
 * @param tag the input tag; it is narrowed to a byte and stored in {@code alias}
 * @throws HiveException wrapping any exception raised while processing the batch
 */
@Override
public void process(Object row, int tag) throws HiveException {
  try {
    final VectorizedRowBatch batch = (VectorizedRowBatch) row;
    alias = (byte) tag;

    // One-time initialization of the members shared by all calls.
    if (needCommonSetup) {
      commonSetup(batch);
      singleJoinColumn = bigTableKeyColumnMap[0];
      needCommonSetup = false;
    }

    // (Re)bind the specialized hash set and cache its key range, when it exposes one.
    if (needHashTableSetup) {
      hashSet = (VectorMapJoinLongHashSet) vectorMapJoinHashTable;
      useMinMax = hashSet.useMinMax();
      if (useMinMax) {
        min = hashSet.min();
        max = hashSet.max();
      }
      needHashTableSetup = false;
    }

    batchCounter++;

    // Left-semi join: the big-table filters can be applied before any key lookup.
    for (VectorExpression filterExpr : bigTableFilterExpressions) {
      filterExpr.evaluate(batch);
    }

    final int inputLogicalSize = batch.size;
    if (inputLogicalSize == 0) {
      if (LOG.isDebugEnabled()) {
        LOG.debug(CLASS_NAME + " batch #" + batchCounter + " empty");
      }
      return;
    }

    // Key expressions are evaluated on the batch before the key column is read.
    if (bigTableKeyExpressions != null) {
      for (VectorExpression keyExpr : bigTableKeyExpressions) {
        keyExpr.evaluate(batch);
      }
    }

    final LongColumnVector keyColumn = (LongColumnVector) batch.cols[singleJoinColumn];
    final long[] keyValues = keyColumn.vector;

    if (keyColumn.isRepeating) {
      // Repeating key: entry 0 stands for every row, regardless of selectedInUse.
      final boolean repeatedKeyIsNull = !keyColumn.noNulls && keyColumn.isNull[0];

      JoinUtil.JoinResult joinResult = JoinUtil.JoinResult.NOMATCH;
      if (!repeatedKeyIsNull) {
        final long key = keyValues[0];
        final boolean outOfRange = useMinMax && (key < min || key > max);
        if (!outOfRange) {
          joinResult = hashSet.contains(key, hashSetResults[0]);
        }
      }

      if (LOG.isDebugEnabled()) {
        LOG.debug(CLASS_NAME + " batch #" + batchCounter + " repeated joinResult " + joinResult.name());
      }
      finishLeftSemiRepeated(batch, joinResult, hashSetResults[0]);
    } else {
      if (LOG.isDebugEnabled()) {
        LOG.debug(CLASS_NAME + " batch #" + batchCounter + " non-repeated");
      }

      final int[] selected = batch.selected;
      final boolean selectedInUse = batch.selectedInUse;

      int hashSetResultCount = 0;
      int allMatchCount = 0;
      int spillCount = 0;

      // Only the first key of a run of equal keys is looked up; its result is reused.
      long saveKey = 0;
      boolean haveSaveKey = false;
      JoinUtil.JoinResult saveJoinResult = JoinUtil.JoinResult.NOMATCH;

      // Walk the logical rows; map through selected[] when the batch uses it.
      for (int logical = 0; logical < inputLogicalSize; logical++) {
        final int batchIndex = selectedInUse ? selected[logical] : logical;

        final boolean isNull = !keyColumn.noNulls && keyColumn.isNull[batchIndex];
        final long currentKey = isNull ? 0 : keyValues[batchIndex];

        if (isNull || !haveSaveKey || currentKey != saveKey) {
          // Close out the previous series: only a SPILL result slot is retained.
          if (haveSaveKey) {
            switch (saveJoinResult) {
              case SPILL:
                hashSetResultCount++;
                break;
              default:
                break;
            }
          }

          if (isNull) {
            // A null key never matches and never becomes the saved key.
            saveJoinResult = JoinUtil.JoinResult.NOMATCH;
            haveSaveKey = false;
          } else {
            haveSaveKey = true;
            saveKey = currentKey;
            final boolean outOfRange = useMinMax && (currentKey < min || currentKey > max);
            saveJoinResult = outOfRange
                ? JoinUtil.JoinResult.NOMATCH
                : hashSet.contains(currentKey, hashSetResults[hashSetResultCount]);
          }
        }

        // First row of a series and its duplicates are recorded the same way.
        switch (saveJoinResult) {
          case MATCH:
            allMatchs[allMatchCount++] = batchIndex;
            break;
          case SPILL:
            spills[spillCount] = batchIndex;
            spillHashMapResultIndices[spillCount] = hashSetResultCount;
            spillCount++;
            break;
          case NOMATCH:
            break;
        }
      }

      // Account for the series that was still open when the loop ended.
      if (haveSaveKey) {
        switch (saveJoinResult) {
          case SPILL:
            hashSetResultCount++;
            break;
          default:
            break;
        }
      }

      if (LOG.isDebugEnabled()) {
        LOG.debug(CLASS_NAME
            + " allMatchs " + intArrayToRangesString(allMatchs, allMatchCount)
            + " spills " + intArrayToRangesString(spills, spillCount)
            + " spillHashMapResultIndices " + intArrayToRangesString(spillHashMapResultIndices, spillCount)
            + " hashMapResults " + Arrays.toString(Arrays.copyOfRange(hashSetResults, 0, hashSetResultCount)));
      }

      finishLeftSemi(batch, allMatchCount, spillCount, (VectorMapJoinHashTableResult[]) hashSetResults);
    }

    // Whatever is still selected in the batch goes downstream.
    if (batch.size > 0) {
      forwardBigTableBatch(batch);
    }
  } catch (Exception e) {
    // Every failure, checked or unchecked, is reported to the caller as a HiveException.
    throw new HiveException(e);
  }
}
use of org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression in project hive by apache.
the class VectorMapJoinOuterLongOperator method process.
// ---------------------------------------------------------------------------
// Process Single-Column Long Outer Join on a vectorized row batch.
//
/**
* Processes one vectorized row batch for the Single-Column Long outer join.
*
* Steps, in order:
* 1. One-time setup, guarded by needCommonSetup and needHashTableSetup.
* 2. Early return when the incoming batch is empty.
* 3. Per-batch outer join setup and a snapshot of the incoming selected array.
* 4. Evaluation of the big table filter expressions, then the key expressions.
* 5. Either a single lookup when the join key column is repeating, or a per-row
*    loop that performs a hash map lookup only for the first key of each run of
*    equal keys and reuses that result for the rest of the run.
* 6. Hand-off to finishOuterRepeated / finishOuter, then forwarding of whatever
*    rows remain in the batch.
*
* @param row the input object; it is cast to VectorizedRowBatch
* @param tag stored in alias after narrowing to byte
* @throws HiveException wrapping any exception raised inside the method body
*/
@Override
public void process(Object row, int tag) throws HiveException {
try {
VectorizedRowBatch batch = (VectorizedRowBatch) row;
alias = (byte) tag;
if (needCommonSetup) {
// Our one time process method initialization.
commonSetup(batch);
/*
* Initialize Single-Column Long members for this specialized class.
*/
// Only the first entry of the key column map is used: this class joins on one column.
singleJoinColumn = bigTableKeyColumnMap[0];
needCommonSetup = false;
}
if (needHashTableSetup) {
// Setup our hash table specialization. It will be the first time the process
// method is called, or after a Hybrid Grace reload.
/*
* Get our Single-Column Long hash map information for this specialized class.
*/
hashMap = (VectorMapJoinLongHashMap) vectorMapJoinHashTable;
// Cache the key range only when the hash map reports that min/max is in use;
// otherwise min and max are left untouched and are never consulted below.
useMinMax = hashMap.useMinMax();
if (useMinMax) {
min = hashMap.min();
max = hashMap.max();
}
needHashTableSetup = false;
}
// Incremented before the empty check, so empty batches are numbered too.
batchCounter++;
final int inputLogicalSize = batch.size;
if (inputLogicalSize == 0) {
if (LOG.isDebugEnabled()) {
LOG.debug(CLASS_NAME + " batch #" + batchCounter + " empty");
}
return;
}
// Do the per-batch setup for an outer join.
outerPerBatchSetup(batch);
// For outer join, remember our input rows before ON expression filtering or before
// hash table matching so we can generate results for all rows (matching and non matching)
// later.
boolean inputSelectedInUse = batch.selectedInUse;
if (inputSelectedInUse) {
// if (!verifyMonotonicallyIncreasing(batch.selected, batch.size)) {
// throw new HiveException("batch.selected is not in sort order and unique");
// }
// Snapshot the first inputLogicalSize entries of batch.selected into inputSelected.
System.arraycopy(batch.selected, 0, inputSelected, 0, inputLogicalSize);
}
// Filtering for outer join just removes rows available for hash table matching.
boolean someRowsFilteredOut = false;
if (bigTableFilterExpressions.length > 0) {
// Evaluate every filter against the batch. A batch.size that differs from the
// size captured on entry means the filters removed at least one row.
for (VectorExpression ve : bigTableFilterExpressions) {
ve.evaluate(batch);
}
someRowsFilteredOut = (batch.size != inputLogicalSize);
if (LOG.isDebugEnabled()) {
if (batch.selectedInUse) {
if (inputSelectedInUse) {
LOG.debug(CLASS_NAME + " inputSelected " + intArrayToRangesString(inputSelected, inputLogicalSize) + " filtered batch.selected " + intArrayToRangesString(batch.selected, batch.size));
} else {
LOG.debug(CLASS_NAME + " inputLogicalSize " + inputLogicalSize + " filtered batch.selected " + intArrayToRangesString(batch.selected, batch.size));
}
}
}
}
// Perform any key expressions. Results will go into scratch columns.
if (bigTableKeyExpressions != null) {
for (VectorExpression ve : bigTableKeyExpressions) {
ve.evaluate(batch);
}
}
/*
* Single-Column Long specific declarations.
*/
// The one join column for this specialized class.
LongColumnVector joinColVector = (LongColumnVector) batch.cols[singleJoinColumn];
long[] vector = joinColVector.vector;
/*
* Single-Column Long check for repeating.
*/
// Check single column for repeating.
boolean allKeyInputColumnsRepeating = joinColVector.isRepeating;
if (allKeyInputColumnsRepeating) {
/*
* Repeating.
*/
// All key input columns are repeating. Generate key once. Lookup once.
// Since the key is repeated, we must use entry 0 regardless of selectedInUse.
/*
* Single-Column Long specific repeated lookup.
*/
JoinUtil.JoinResult joinResult;
if (batch.size == 0) {
// Whole repeated key batch was filtered out.
joinResult = JoinUtil.JoinResult.NOMATCH;
} else if (!joinColVector.noNulls && joinColVector.isNull[0]) {
// Any (repeated) null key column is no match for whole batch.
joinResult = JoinUtil.JoinResult.NOMATCH;
} else {
// Handle *repeated* join key, if found.
long key = vector[0];
// LOG.debug(CLASS_NAME + " repeated key " + key);
if (useMinMax && (key < min || key > max)) {
// Out of range for whole batch.
joinResult = JoinUtil.JoinResult.NOMATCH;
} else {
// The single lookup for the whole batch writes into result slot 0.
joinResult = hashMap.lookup(key, hashMapResults[0]);
}
}
if (LOG.isDebugEnabled()) {
LOG.debug(CLASS_NAME + " batch #" + batchCounter + " repeated joinResult " + joinResult.name());
}
finishOuterRepeated(batch, joinResult, hashMapResults[0], someRowsFilteredOut, inputSelectedInUse, inputLogicalSize);
} else {
if (LOG.isDebugEnabled()) {
LOG.debug(CLASS_NAME + " batch #" + batchCounter + " non-repeated");
}
int[] selected = batch.selected;
boolean selectedInUse = batch.selectedInUse;
// Counters for the parallel bookkeeping arrays filled in by the loop below.
int hashMapResultCount = 0;
int allMatchCount = 0;
int equalKeySeriesCount = 0;
int spillCount = 0;
// Rows already removed by the filters count as non-matches from the start.
boolean atLeastOneNonMatch = someRowsFilteredOut;
/*
* Single-Column Long specific variables.
*/
long saveKey = 0;
// We optimize performance by only looking up the first key in a series of equal keys.
boolean haveSaveKey = false;
JoinUtil.JoinResult saveJoinResult = JoinUtil.JoinResult.NOMATCH;
// Logical loop over the rows in the batch since the batch may have selected in use.
for (int logical = 0; logical < batch.size; logical++) {
int batchIndex = (selectedInUse ? selected[logical] : logical);
// VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, taskName + ", " + getOperatorId() + " candidate " + CLASS_NAME + " batch");
/*
* Single-Column Long outer null detection.
*/
boolean isNull = !joinColVector.noNulls && joinColVector.isNull[batchIndex];
if (isNull) {
// A NULL key must not interfere with the current equal key series, if there
// is one, so saveKey, haveSaveKey and saveJoinResult are left unchanged.
//
// Let a current MATCH equal key series keep going, or
// Let a current SPILL equal key series keep going, or
// Let a current NOMATCH keep not matching.
atLeastOneNonMatch = true;
// LOG.debug(CLASS_NAME + " logical " + logical + " batchIndex " + batchIndex + " NULL");
} else {
/*
* Single-Column Long outer get key.
*/
long currentKey = vector[batchIndex];
if (!haveSaveKey || currentKey != saveKey) {
// New key: close out the previous series (if any) before looking this one up.
if (haveSaveKey) {
// Move on with our counts.
// The result slot is only consumed by MATCH and SPILL; after a NOMATCH the
// same hashMapResults slot is reused by the next lookup.
switch(saveJoinResult) {
case MATCH:
hashMapResultCount++;
equalKeySeriesCount++;
break;
case SPILL:
hashMapResultCount++;
break;
case NOMATCH:
break;
}
}
// Regardless of our matching result, we keep that information to make multiple use
// of it for a possible series of equal keys.
haveSaveKey = true;
/*
* Single-Column Long specific save key.
*/
saveKey = currentKey;
if (useMinMax && (currentKey < min || currentKey > max)) {
// Key out of range for whole hash table.
saveJoinResult = JoinUtil.JoinResult.NOMATCH;
} else {
saveJoinResult = hashMap.lookup(currentKey, hashMapResults[hashMapResultCount]);
}
switch(saveJoinResult) {
case MATCH:
// Start a new equal key series: record its result slot, where its rows begin in
// allMatchs, whether the result is a single row, and a duplicate count of 1.
equalKeySeriesHashMapResultIndices[equalKeySeriesCount] = hashMapResultCount;
equalKeySeriesAllMatchIndices[equalKeySeriesCount] = allMatchCount;
equalKeySeriesIsSingleValue[equalKeySeriesCount] = hashMapResults[hashMapResultCount].isSingleRow();
equalKeySeriesDuplicateCounts[equalKeySeriesCount] = 1;
allMatchs[allMatchCount++] = batchIndex;
// VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " MATCH isSingleValue " + equalKeySeriesIsSingleValue[equalKeySeriesCount] + " currentKey " + currentKey);
break;
case SPILL:
// Record the spilled row together with the result slot of its series.
spills[spillCount] = batchIndex;
spillHashMapResultIndices[spillCount] = hashMapResultCount;
spillCount++;
break;
case NOMATCH:
atLeastOneNonMatch = true;
// VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " NOMATCH" + " currentKey " + currentKey);
break;
}
} else {
// Same key as the previous non-NULL row: reuse saveJoinResult without a lookup.
switch(saveJoinResult) {
case MATCH:
equalKeySeriesDuplicateCounts[equalKeySeriesCount]++;
allMatchs[allMatchCount++] = batchIndex;
// VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " MATCH duplicate");
break;
case SPILL:
spills[spillCount] = batchIndex;
spillHashMapResultIndices[spillCount] = hashMapResultCount;
spillCount++;
break;
case NOMATCH:
// atLeastOneNonMatch was already set when the first row of this series missed.
// VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " NOMATCH duplicate");
break;
}
}
// if (!verifyMonotonicallyIncreasing(allMatchs, allMatchCount)) {
// throw new HiveException("allMatchs is not in sort order and unique");
// }
}
}
if (haveSaveKey) {
// Update our counts for the last key.
switch(saveJoinResult) {
case MATCH:
hashMapResultCount++;
equalKeySeriesCount++;
break;
case SPILL:
hashMapResultCount++;
break;
case NOMATCH:
break;
}
}
if (LOG.isDebugEnabled()) {
LOG.debug(CLASS_NAME + " batch #" + batchCounter + " allMatchs " + intArrayToRangesString(allMatchs, allMatchCount) + " equalKeySeriesHashMapResultIndices " + intArrayToRangesString(equalKeySeriesHashMapResultIndices, equalKeySeriesCount) + " equalKeySeriesAllMatchIndices " + intArrayToRangesString(equalKeySeriesAllMatchIndices, equalKeySeriesCount) + " equalKeySeriesIsSingleValue " + Arrays.toString(Arrays.copyOfRange(equalKeySeriesIsSingleValue, 0, equalKeySeriesCount)) + " equalKeySeriesDuplicateCounts " + Arrays.toString(Arrays.copyOfRange(equalKeySeriesDuplicateCounts, 0, equalKeySeriesCount)) + " atLeastOneNonMatch " + atLeastOneNonMatch + " inputSelectedInUse " + inputSelectedInUse + " inputLogicalSize " + inputLogicalSize + " spills " + intArrayToRangesString(spills, spillCount) + " spillHashMapResultIndices " + intArrayToRangesString(spillHashMapResultIndices, spillCount) + " hashMapResults " + Arrays.toString(Arrays.copyOfRange(hashMapResults, 0, hashMapResultCount)));
}
// We will generate results for all matching and non-matching rows.
finishOuter(batch, allMatchCount, equalKeySeriesCount, atLeastOneNonMatch, inputSelectedInUse, inputLogicalSize, spillCount, hashMapResultCount);
}
if (batch.size > 0) {
// Forward any remaining selected rows.
forwardBigTableBatch(batch);
}
} catch (IOException e) {
// Both catch clauses wrap the cause in a HiveException; IOException is also covered
// by the broader Exception clause below.
throw new HiveException(e);
} catch (Exception e) {
// NOTE(review): presumably a HiveException thrown inside the try block is caught here
// as well and wrapped in a second HiveException - confirm that this is intended.
throw new HiveException(e);
}
}
Aggregations