use of org.apache.hadoop.hive.serde2.ByteStream.Output in project hive by apache.
the class GenMRSkewJoinProcessor method processSkewJoin.
/**
 * Create tasks for processing skew joins. The idea is (HIVE-964) to use
 * separated jobs and map-joins to handle skew joins.
 * <p>
 * <ul>
 * <li>
 * Number of mr jobs to handle skew keys is the number of tables minus 1 (we
 * can stream the last table, so big keys in the last table will not be a
 * problem).
 * <li>
 * At runtime in Join, we output big keys in one table into one corresponding
 * directory, and all same keys in other tables into different dirs (one for
 * each table). The directories will look like:
 * <ul>
 * <li>
 * dir-T1-bigkeys(containing big keys in T1), dir-T2-keys(containing keys
 * which are big in T1), dir-T3-keys(containing keys which are big in T1), ...
 * <li>
 * dir-T1-keys(containing keys which are big in T2), dir-T2-bigkeys(containing
 * big keys in T2), dir-T3-keys(containing keys which are big in T2), ...
 * <li>
 * dir-T1-keys(containing keys which are big in T3), dir-T2-keys(containing
 * keys which are big in T3), dir-T3-bigkeys(containing big keys in T3), ...
 * </ul>
 * </ul>
 * For each table, we launch one mapjoin job, taking the directory containing
 * big keys in this table and corresponding dirs in other tables as input.
 * (Actually one job for one row in the above.)
 *
 * <p>
 * The generated map-join tasks, followed by the original child tasks of
 * {@code currTask}, become the candidate list of a new {@link ConditionalTask}
 * which is then attached as the dependent task of {@code currTask}. The
 * method returns without changing anything when
 * {@code GenMRSkewJoinProcessor.skewJoinEnabled} rejects the join.
 *
 * <p>
 * For more discussions, please check
 * https://issues.apache.org/jira/browse/HIVE-964.
 *
 * @param joinOp the join operator whose descriptor is switched to skew-join handling
 * @param currTask the task containing the join; its work is cast to {@code MapredWork}
 * @param parseCtx source of the configuration and of the MR temporary path
 * @throws SemanticException declared by the signature; no throw site is visible in this method
 */
@SuppressWarnings("unchecked")
public static void processSkewJoin(JoinOperator joinOp, Task<? extends Serializable> currTask, ParseContext parseCtx) throws SemanticException {
  // Original note: "now does not work with outer joins".
  if (!GenMRSkewJoinProcessor.skewJoinEnabled(parseCtx.getConf(), joinOp)) {
    return;
  }

  List<Task<? extends Serializable>> children = currTask.getChildTasks();
  Path baseTmpDir = parseCtx.getContext().getMRTmpPath();
  JoinDesc joinDescriptor = joinOp.getConf();
  Map<Byte, List<ExprNodeDesc>> joinValues = joinDescriptor.getExprs();
  int numAliases = joinValues.size();
  Byte[] tags = joinDescriptor.getTagOrder();

  // For every table: the directory receiving its big keys, plus one directory per other
  // table receiving that table's rows for the same keys.
  Map<Byte, Path> bigKeysDirMap = new HashMap<Byte, Path>();
  Map<Byte, Map<Byte, Path>> smallKeysDirMap = new HashMap<Byte, Map<Byte, Path>>();
  for (int i = 0; i < numAliases; i++) {
    Byte alias = tags[i];
    bigKeysDirMap.put(alias, getBigKeysDir(baseTmpDir, alias));
    Map<Byte, Path> smallKeysMap = new HashMap<Byte, Path>();
    smallKeysDirMap.put(alias, smallKeysMap);
    for (Byte src2 : tags) {
      if (!src2.equals(alias)) {
        smallKeysMap.put(src2, getSmallKeysDir(baseTmpDir, alias, src2));
      }
    }
  }

  joinDescriptor.setHandleSkewJoin(true);
  joinDescriptor.setBigKeysDirMap(bigKeysDirMap);
  joinDescriptor.setSmallKeysDirMap(smallKeysDirMap);
  joinDescriptor.setSkewKeyDefinition(HiveConf.getIntVar(parseCtx.getConf(), HiveConf.ConfVars.HIVESKEWJOINKEY));

  HashMap<Path, Task<? extends Serializable>> bigKeysDirToTaskMap = new HashMap<Path, Task<? extends Serializable>>();
  List<Serializable> listWorks = new ArrayList<Serializable>();
  List<Task<? extends Serializable>> listTasks = new ArrayList<Task<? extends Serializable>>();
  MapredWork currPlan = (MapredWork) currTask.getWork();
  // Work on a clone of the key descriptor so the current plan's descriptor is not shared.
  TableDesc keyTblDesc = (TableDesc) currPlan.getReduceWork().getKeyDesc().clone();
  List<String> joinKeys = Utilities.getColumnNames(keyTblDesc.getProperties());
  List<String> joinKeyTypes = Utilities.getColumnTypes(keyTblDesc.getProperties());

  Map<Byte, TableDesc> tableDescList = new HashMap<Byte, TableDesc>();
  Map<Byte, RowSchema> rowSchemaList = new HashMap<Byte, RowSchema>();
  Map<Byte, List<ExprNodeDesc>> newJoinValues = new HashMap<Byte, List<ExprNodeDesc>>();
  Map<Byte, List<ExprNodeDesc>> newJoinKeys = new HashMap<Byte, List<ExprNodeDesc>>();
  // used for create mapJoinDesc, should be in order; pre-sized so set(i, ...) works below
  List<TableDesc> newJoinValueTblDesc = new ArrayList<TableDesc>();
  for (int i = 0; i < tags.length; i++) {
    newJoinValueTblDesc.add(null);
  }

  // Describe the spilled rows of each table: value columns first, then the join keys.
  for (int i = 0; i < numAliases; i++) {
    Byte alias = tags[i];
    List<ExprNodeDesc> valueCols = joinValues.get(alias);
    int columnSize = valueCols.size();
    List<ExprNodeDesc> newValueExpr = new ArrayList<ExprNodeDesc>();
    List<ExprNodeDesc> newKeyExpr = new ArrayList<ExprNodeDesc>();
    ArrayList<ColumnInfo> columnInfos = new ArrayList<ColumnInfo>();
    StringBuilder colNames = new StringBuilder();
    StringBuilder colTypes = new StringBuilder();
    boolean first = true;

    for (int k = 0; k < columnSize; k++) {
      ExprNodeDesc valueCol = valueCols.get(k);
      // any name, it does not matter.
      String newColName = i + "_VALUE_" + k;
      ColumnInfo columnInfo = new ColumnInfo(newColName, valueCol.getTypeInfo(), alias.toString(), false);
      columnInfos.add(columnInfo);
      newValueExpr.add(new ExprNodeColumnDesc(columnInfo));
      if (!first) {
        colNames.append(',');
        colTypes.append(',');
      }
      first = false;
      colNames.append(newColName);
      colTypes.append(valueCol.getTypeString());
    }
    // Snapshot the value-only column lists before the join keys are appended; they
    // describe the value table handed to the MapJoinDesc.
    String valueColNames = colNames.toString();
    String valueColTypes = colTypes.toString();

    // we are putting join keys at last part of the spilled table
    for (int k = 0; k < joinKeys.size(); k++) {
      String keyName = joinKeys.get(k);
      String keyType = joinKeyTypes.get(k);
      if (!first) {
        colNames.append(',');
        colTypes.append(',');
      }
      first = false;
      colNames.append(keyName);
      colTypes.append(keyType);
      ColumnInfo columnInfo = new ColumnInfo(keyName, TypeInfoFactory.getPrimitiveTypeInfo(keyType), alias.toString(), false);
      columnInfos.add(columnInfo);
      newKeyExpr.add(new ExprNodeColumnDesc(columnInfo));
    }

    newJoinValues.put(alias, newValueExpr);
    newJoinKeys.put(alias, newKeyExpr);
    tableDescList.put(alias, Utilities.getTableDesc(colNames.toString(), colTypes.toString()));
    rowSchemaList.put(alias, new RowSchema(columnInfos));
    // construct value table Desc
    newJoinValueTblDesc.set(i, Utilities.getTableDesc(valueColNames, valueColTypes));
  }
  joinDescriptor.setSkewKeysValuesTables(tableDescList);
  joinDescriptor.setKeyTableDesc(keyTblDesc);

  // One map-join job per table, except the last one in tag order.
  for (int i = 0; i < numAliases - 1; i++) {
    Byte src = tags[i];
    MapWork newPlan = PlanUtils.getMapRedWork().getMapWork();
    // This code has been only added for testing
    boolean mapperCannotSpanPartns = parseCtx.getConf().getBoolVar(HiveConf.ConfVars.HIVE_MAPPER_CANNOT_SPAN_MULTIPLE_PARTITIONS);
    newPlan.setMapperCannotSpanPartns(mapperCannotSpanPartns);
    // Each job gets its own copy of the plan because the cloned join's children are
    // re-parented onto the new map join below.
    MapredWork clonePlan = SerializationUtilities.clonePlan(currPlan);

    Operator<? extends OperatorDesc>[] parentOps = new TableScanOperator[tags.length];
    for (int k = 0; k < tags.length; k++) {
      // NOTE(review): these lookups use the position k as the map key, whereas the maps were
      // filled with tags[i] as key — equivalent only if the tag order is the identity; confirm.
      Operator<? extends OperatorDesc> ts = GenMapRedUtils.createTemporaryTableScanOperator(joinOp.getCompilationOpContext(), rowSchemaList.get((byte) k));
      ((TableScanOperator) ts).setTableDescSkewJoin(tableDescList.get((byte) k));
      parentOps[k] = ts;
    }

    // The big-key directory of table i is the streamed (map-side) input of this job.
    Operator<? extends OperatorDesc> bigTableScan = parentOps[i];
    ArrayList<String> aliases = new ArrayList<String>();
    String alias = src.toString().intern();
    aliases.add(alias);
    Path bigKeyDirPath = bigKeysDirMap.get(src);
    newPlan.addPathToAlias(bigKeyDirPath, aliases);
    newPlan.getAliasToWork().put(alias, bigTableScan);
    PartitionDesc part = new PartitionDesc(tableDescList.get(src), null);
    newPlan.addPathToPartitionInfo(bigKeyDirPath, part);
    newPlan.getAliasToPartnInfo().put(alias, part);

    Operator<? extends OperatorDesc> reducer = clonePlan.getReduceWork().getReducer();
    assert reducer instanceof JoinOperator;
    JoinOperator cloneJoinOp = (JoinOperator) reducer;

    String dumpFilePrefix = "mapfile" + PlanUtils.getCountForMapJoinDumpFilePrefix();
    MapJoinDesc mapJoinDescriptor = new MapJoinDesc(newJoinKeys, keyTblDesc, newJoinValues, newJoinValueTblDesc, newJoinValueTblDesc, joinDescriptor.getOutputColumnNames(), i, joinDescriptor.getConds(), joinDescriptor.getFilters(), joinDescriptor.getNoOuterJoin(), dumpFilePrefix, joinDescriptor.getMemoryMonitorInfo(), joinDescriptor.getInMemoryDataSize());
    mapJoinDescriptor.setTagOrder(tags);
    mapJoinDescriptor.setHandleSkewJoin(false);
    mapJoinDescriptor.setNullSafes(joinDescriptor.getNullSafes());
    mapJoinDescriptor.setColumnExprMap(joinDescriptor.getColumnExprMap());

    // Every other table is read through the local work from its "small keys" directory.
    MapredLocalWork localPlan = new MapredLocalWork(new LinkedHashMap<String, Operator<? extends OperatorDesc>>(), new LinkedHashMap<String, FetchWork>());
    Map<Byte, Path> smallTblDirs = smallKeysDirMap.get(src);
    for (int j = 0; j < numAliases; j++) {
      if (j == i) {
        continue;
      }
      Byte smallAlias = tags[j];
      Operator<? extends OperatorDesc> smallTableScan = parentOps[j];
      localPlan.getAliasToWork().put(smallAlias.toString(), smallTableScan);
      Path tblDir = smallTblDirs.get(smallAlias);
      localPlan.getAliasToFetchWork().put(smallAlias.toString(), new FetchWork(tblDir, tableDescList.get(smallAlias)));
    }
    newPlan.setMapRedLocalWork(localPlan);

    // construct a map join and set it as the child operator of the table scans
    MapJoinOperator mapJoinOp = (MapJoinOperator) OperatorFactory.getAndMakeChild(joinOp.getCompilationOpContext(), mapJoinDescriptor, (RowSchema) null, parentOps);
    // change the children of the original join operator to point to the map
    // join operator
    List<Operator<? extends OperatorDesc>> childOps = cloneJoinOp.getChildOperators();
    for (Operator<? extends OperatorDesc> childOp : childOps) {
      childOp.replaceParent(cloneJoinOp, mapJoinOp);
    }
    mapJoinOp.setChildOperators(childOps);

    HiveConf jc = new HiveConf(parseCtx.getConf(), GenMRSkewJoinProcessor.class);
    newPlan.setNumMapTasks(HiveConf.getIntVar(jc, HiveConf.ConfVars.HIVESKEWJOINMAPJOINNUMMAPTASK));
    newPlan.setMinSplitSize(HiveConf.getLongVar(jc, HiveConf.ConfVars.HIVESKEWJOINMAPJOINMINSPLIT));
    newPlan.setInputformat(HiveInputFormat.class.getName());

    MapredWork w = new MapredWork();
    w.setMapWork(newPlan);
    Task<? extends Serializable> skewJoinMapJoinTask = TaskFactory.get(w);
    skewJoinMapJoinTask.setFetchSource(currTask.isFetchSource());
    bigKeysDirToTaskMap.put(bigKeyDirPath, skewJoinMapJoinTask);
    listWorks.add(skewJoinMapJoinTask.getWork());
    listTasks.add(skewJoinMapJoinTask);
  }

  if (children != null) {
    // The original children now depend on every skew-join task instead of on currTask.
    for (Task<? extends Serializable> tsk : listTasks) {
      for (Task<? extends Serializable> oldChild : children) {
        tsk.addDependentTask(oldChild);
      }
    }
    for (Task<? extends Serializable> oldChild : children) {
      oldChild.getParentTasks().remove(currTask);
    }
    // The original children are also candidates of the conditional task.
    listTasks.addAll(children);
  }

  ConditionalResolverSkewJoinCtx context = new ConditionalResolverSkewJoinCtx(bigKeysDirToTaskMap, children);
  ConditionalWork cndWork = new ConditionalWork(listWorks);
  ConditionalTask cndTsk = (ConditionalTask) TaskFactory.get(cndWork);
  cndTsk.setListTasks(listTasks);
  cndTsk.setResolver(new ConditionalResolverSkewJoin());
  cndTsk.setResolverCtx(context);

  // Replace currTask's child list with the conditional task only.
  currTask.setChildTasks(new ArrayList<Task<? extends Serializable>>());
  currTask.addDependentTask(cndTsk);
}
use of org.apache.hadoop.hive.serde2.ByteStream.Output in project hive by apache.
the class VectorMapJoinInnerBigOnlyMultiKeyOperator method process.
// ---------------------------------------------------------------------------
// Process Multi-Key Inner Big-Only Join on a vectorized row batch.
//
// The key columns of each big-table row are serialized into a byte key which is then
// checked against the hash multi-set. A row whose key contains any NULL is treated as
// NOMATCH without a lookup, and a run of consecutive rows with byte-identical keys shares
// a single lookup.
//
// @param row the incoming batch; cast to VectorizedRowBatch
// @param tag narrowed to byte and stored in the alias field
// @throws HiveException wrapping any exception raised while processing the batch
//
@Override
public void process(Object row, int tag) throws HiveException {
try {
VectorizedRowBatch batch = (VectorizedRowBatch) row;
alias = (byte) tag;
if (needCommonSetup) {
// Our one time process method initialization.
commonSetup(batch);
/*
* Initialize Multi-Key members for this specialized class.
*/
keyVectorSerializeWrite = new VectorSerializeRow(new BinarySortableSerializeWrite(bigTableKeyColumnMap.length));
keyVectorSerializeWrite.init(bigTableKeyTypeInfos, bigTableKeyColumnMap);
// Two buffers: one for the key being serialized, one for the last key looked up.
currentKeyOutput = new Output();
saveKeyOutput = new Output();
needCommonSetup = false;
}
if (needHashTableSetup) {
// Setup our hash table specialization. It will be the first time the process
// method is called, or after a Hybrid Grace reload.
/*
* Get our Multi-Key hash multi-set information for this specialized class.
*/
hashMultiSet = (VectorMapJoinBytesHashMultiSet) vectorMapJoinHashTable;
needHashTableSetup = false;
}
batchCounter++;
// For inner joins, we may apply the filter(s) now.
// NOTE(review): iterated without a null check, unlike bigTableKeyExpressions below —
// presumably always non-null; confirm.
for (VectorExpression ve : bigTableFilterExpressions) {
ve.evaluate(batch);
}
// Read the size only after the filters have been evaluated on the batch.
final int inputLogicalSize = batch.size;
if (inputLogicalSize == 0) {
if (LOG.isDebugEnabled()) {
LOG.debug(CLASS_NAME + " batch #" + batchCounter + " empty");
}
return;
}
// Perform any key expressions. Results will go into scratch columns.
if (bigTableKeyExpressions != null) {
for (VectorExpression ve : bigTableKeyExpressions) {
ve.evaluate(batch);
}
}
/*
* Multi-Key specific declarations.
*/
// None.
/*
* Multi-Key check for repeating.
*/
// If all BigTable input columns to key expressions are isRepeating, then
// calculate key once; lookup once.
boolean allKeyInputColumnsRepeating;
if (bigTableKeyColumnMap.length == 0) {
// With no key columns the batch is handled by the row-by-row path.
allKeyInputColumnsRepeating = false;
} else {
allKeyInputColumnsRepeating = true;
for (int i = 0; i < bigTableKeyColumnMap.length; i++) {
if (!batch.cols[bigTableKeyColumnMap[i]].isRepeating) {
allKeyInputColumnsRepeating = false;
break;
}
}
}
if (allKeyInputColumnsRepeating) {
/*
* Repeating.
*/
// All key input columns are repeating. Generate key once. Lookup once.
// Since the key is repeated, we must use entry 0 regardless of selectedInUse.
/*
* Multi-Key specific repeated lookup.
*/
keyVectorSerializeWrite.setOutput(currentKeyOutput);
keyVectorSerializeWrite.serializeWrite(batch, 0);
JoinUtil.JoinResult joinResult;
if (keyVectorSerializeWrite.getHasAnyNulls()) {
// A key with a NULL column is not looked up; it is reported as NOMATCH.
joinResult = JoinUtil.JoinResult.NOMATCH;
} else {
byte[] keyBytes = currentKeyOutput.getData();
int keyLength = currentKeyOutput.getLength();
joinResult = hashMultiSet.contains(keyBytes, 0, keyLength, hashMultiSetResults[0]);
}
if (LOG.isDebugEnabled()) {
LOG.debug(CLASS_NAME + " batch #" + batchCounter + " repeated joinResult " + joinResult.name());
}
finishInnerBigOnlyRepeated(batch, joinResult, hashMultiSetResults[0]);
} else {
if (LOG.isDebugEnabled()) {
LOG.debug(CLASS_NAME + " batch #" + batchCounter + " non-repeated");
}
// Matching rows are recorded in allMatchs / allMatchCount and spilled rows in
// spills / spillCount; both are handed to finishInnerBigOnly after the loop.
int[] selected = batch.selected;
boolean selectedInUse = batch.selectedInUse;
int hashMultiSetResultCount = 0;
int allMatchCount = 0;
int equalKeySeriesCount = 0;
int spillCount = 0;
/*
* Multi-Key specific variables.
*/
Output temp;
// We optimize performance by only looking up the first key in a series of equal keys.
boolean haveSaveKey = false;
JoinUtil.JoinResult saveJoinResult = JoinUtil.JoinResult.NOMATCH;
// Logical loop over the rows in the batch since the batch may have selected in use.
for (int logical = 0; logical < inputLogicalSize; logical++) {
int batchIndex = (selectedInUse ? selected[logical] : logical);
/*
* Multi-Key get key.
*/
// Generate binary sortable key for current row in vectorized row batch.
keyVectorSerializeWrite.setOutput(currentKeyOutput);
keyVectorSerializeWrite.serializeWrite(batch, batchIndex);
boolean isAnyNulls = keyVectorSerializeWrite.getHasAnyNulls();
// A new key series starts when the key has a NULL, when there is no saved key, or when
// the serialized bytes differ from the saved key.
if (isAnyNulls || !haveSaveKey || !saveKeyOutput.arraysEquals(currentKeyOutput)) {
if (haveSaveKey) {
// Move on with our counts.
switch(saveJoinResult) {
case MATCH:
// We have extracted the count from the hash multi-set result, so we don't keep it.
equalKeySeriesCount++;
break;
case SPILL:
// We keep the hash multi-set result for its spill information.
hashMultiSetResultCount++;
break;
case NOMATCH:
break;
}
}
if (isAnyNulls) {
// NULL keys are not looked up and are not remembered as the saved key.
saveJoinResult = JoinUtil.JoinResult.NOMATCH;
haveSaveKey = false;
} else {
// Regardless of our matching result, we keep that information to make multiple use
// of it for a possible series of equal keys.
haveSaveKey = true;
/*
* Multi-Key specific save key.
*/
// Swap the two buffers: the key just serialized becomes the saved key, and the old
// saved buffer is the one handed to setOutput for the next row.
temp = saveKeyOutput;
saveKeyOutput = currentKeyOutput;
currentKeyOutput = temp;
/*
* Multi-Key specific lookup key.
*/
byte[] keyBytes = saveKeyOutput.getData();
int keyLength = saveKeyOutput.getLength();
saveJoinResult = hashMultiSet.contains(keyBytes, 0, keyLength, hashMultiSetResults[hashMultiSetResultCount]);
}
// First row of the series: open the series bookkeeping for its join result.
switch(saveJoinResult) {
case MATCH:
equalKeySeriesValueCounts[equalKeySeriesCount] = hashMultiSetResults[hashMultiSetResultCount].count();
equalKeySeriesAllMatchIndices[equalKeySeriesCount] = allMatchCount;
equalKeySeriesDuplicateCounts[equalKeySeriesCount] = 1;
allMatchs[allMatchCount++] = batchIndex;
// VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " MATCH isSingleValue " + equalKeySeriesIsSingleValue[equalKeySeriesCount] + " currentKey " + currentKey);
break;
case SPILL:
spills[spillCount] = batchIndex;
spillHashMapResultIndices[spillCount] = hashMultiSetResultCount;
spillCount++;
break;
case NOMATCH:
// VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " NOMATCH" + " currentKey " + currentKey);
break;
}
} else {
// Same key as the previous row: reuse the saved join result without another lookup.
switch(saveJoinResult) {
case MATCH:
equalKeySeriesDuplicateCounts[equalKeySeriesCount]++;
allMatchs[allMatchCount++] = batchIndex;
// VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " MATCH duplicate");
break;
case SPILL:
spills[spillCount] = batchIndex;
spillHashMapResultIndices[spillCount] = hashMultiSetResultCount;
spillCount++;
break;
case NOMATCH:
// VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " NOMATCH duplicate");
break;
}
}
}
if (haveSaveKey) {
// Update our counts for the last key.
switch(saveJoinResult) {
case MATCH:
// We have extracted the count from the hash multi-set result, so we don't keep it.
equalKeySeriesCount++;
break;
case SPILL:
// We keep the hash multi-set result for its spill information.
hashMultiSetResultCount++;
break;
case NOMATCH:
break;
}
}
if (LOG.isDebugEnabled()) {
LOG.debug(CLASS_NAME + " allMatchs " + intArrayToRangesString(allMatchs, allMatchCount) + " equalKeySeriesValueCounts " + longArrayToRangesString(equalKeySeriesValueCounts, equalKeySeriesCount) + " equalKeySeriesAllMatchIndices " + intArrayToRangesString(equalKeySeriesAllMatchIndices, equalKeySeriesCount) + " equalKeySeriesDuplicateCounts " + intArrayToRangesString(equalKeySeriesDuplicateCounts, equalKeySeriesCount) + " spills " + intArrayToRangesString(spills, spillCount) + " spillHashMapResultIndices " + intArrayToRangesString(spillHashMapResultIndices, spillCount) + " hashMapResults " + Arrays.toString(Arrays.copyOfRange(hashMultiSetResults, 0, hashMultiSetResultCount)));
}
// Hand the collected match / spill bookkeeping to the shared inner big-only finish step.
finishInnerBigOnly(batch, allMatchCount, equalKeySeriesCount, spillCount, (VectorMapJoinHashTableResult[]) hashMultiSetResults, hashMultiSetResultCount);
}
if (batch.size > 0) {
// Forward any remaining selected rows.
forwardBigTableBatch(batch);
}
} catch (IOException e) {
// Wrapped exactly like the general case below.
throw new HiveException(e);
} catch (Exception e) {
throw new HiveException(e);
}
}
use of org.apache.hadoop.hive.serde2.ByteStream.Output in project hive by apache.
the class VectorMapJoinInnerMultiKeyOperator method process.
// ---------------------------------------------------------------------------
// Process Multi-Key Inner Join on a vectorized row batch.
//
// The key columns of each big-table row are serialized into a byte key which is then
// looked up in the hash map. A row whose key contains any NULL is treated as NOMATCH
// without a lookup, and a run of consecutive rows with byte-identical keys shares a
// single lookup.
//
// @param row the incoming batch; cast to VectorizedRowBatch
// @param tag narrowed to byte and stored in the alias field
// @throws HiveException wrapping any exception raised while processing the batch
//
@Override
public void process(Object row, int tag) throws HiveException {
try {
VectorizedRowBatch batch = (VectorizedRowBatch) row;
alias = (byte) tag;
if (needCommonSetup) {
// Our one time process method initialization.
commonSetup(batch);
/*
* Initialize Multi-Key members for this specialized class.
*/
keyVectorSerializeWrite = new VectorSerializeRow(new BinarySortableSerializeWrite(bigTableKeyColumnMap.length));
keyVectorSerializeWrite.init(bigTableKeyTypeInfos, bigTableKeyColumnMap);
// Two buffers: one for the key being serialized, one for the last key looked up.
currentKeyOutput = new Output();
saveKeyOutput = new Output();
needCommonSetup = false;
}
if (needHashTableSetup) {
// Setup our hash table specialization. It will be the first time the process
// method is called, or after a Hybrid Grace reload.
/*
* Get our Multi-Key hash map information for this specialized class.
*/
hashMap = (VectorMapJoinBytesHashMap) vectorMapJoinHashTable;
needHashTableSetup = false;
}
batchCounter++;
// Do the per-batch setup for an inner join.
innerPerBatchSetup(batch);
// For inner joins, we may apply the filter(s) now.
// NOTE(review): iterated without a null check, unlike bigTableKeyExpressions below —
// presumably always non-null; confirm.
for (VectorExpression ve : bigTableFilterExpressions) {
ve.evaluate(batch);
}
// Read the size only after the filters have been evaluated on the batch.
final int inputLogicalSize = batch.size;
if (inputLogicalSize == 0) {
if (LOG.isDebugEnabled()) {
LOG.debug(CLASS_NAME + " batch #" + batchCounter + " empty");
}
return;
}
// Perform any key expressions. Results will go into scratch columns.
if (bigTableKeyExpressions != null) {
for (VectorExpression ve : bigTableKeyExpressions) {
ve.evaluate(batch);
}
}
/*
* Multi-Key specific declarations.
*/
// None.
/*
* Multi-Key check for repeating.
*/
// If all BigTable input columns to key expressions are isRepeating, then
// calculate key once; lookup once.
boolean allKeyInputColumnsRepeating;
if (bigTableKeyColumnMap.length == 0) {
// With no key columns the batch is handled by the row-by-row path.
allKeyInputColumnsRepeating = false;
} else {
allKeyInputColumnsRepeating = true;
for (int i = 0; i < bigTableKeyColumnMap.length; i++) {
if (!batch.cols[bigTableKeyColumnMap[i]].isRepeating) {
allKeyInputColumnsRepeating = false;
break;
}
}
}
if (allKeyInputColumnsRepeating) {
/*
* Repeating.
*/
// All key input columns are repeating. Generate key once. Lookup once.
// Since the key is repeated, we must use entry 0 regardless of selectedInUse.
/*
* Multi-Key specific repeated lookup.
*/
keyVectorSerializeWrite.setOutput(currentKeyOutput);
keyVectorSerializeWrite.serializeWrite(batch, 0);
JoinUtil.JoinResult joinResult;
if (keyVectorSerializeWrite.getHasAnyNulls()) {
// A key with a NULL column is not looked up; it is reported as NOMATCH.
joinResult = JoinUtil.JoinResult.NOMATCH;
} else {
byte[] keyBytes = currentKeyOutput.getData();
int keyLength = currentKeyOutput.getLength();
joinResult = hashMap.lookup(keyBytes, 0, keyLength, hashMapResults[0]);
}
if (LOG.isDebugEnabled()) {
LOG.debug(CLASS_NAME + " batch #" + batchCounter + " repeated joinResult " + joinResult.name());
}
finishInnerRepeated(batch, joinResult, hashMapResults[0]);
} else {
if (LOG.isDebugEnabled()) {
LOG.debug(CLASS_NAME + " batch #" + batchCounter + " non-repeated");
}
// Matching rows are recorded in allMatchs / allMatchCount and spilled rows in
// spills / spillCount; the counts are handed to finishInner after the loop.
int[] selected = batch.selected;
boolean selectedInUse = batch.selectedInUse;
int hashMapResultCount = 0;
int allMatchCount = 0;
int equalKeySeriesCount = 0;
int spillCount = 0;
/*
* Multi-Key specific variables.
*/
Output temp;
// We optimize performance by only looking up the first key in a series of equal keys.
boolean haveSaveKey = false;
JoinUtil.JoinResult saveJoinResult = JoinUtil.JoinResult.NOMATCH;
// Logical loop over the rows in the batch since the batch may have selected in use.
for (int logical = 0; logical < inputLogicalSize; logical++) {
int batchIndex = (selectedInUse ? selected[logical] : logical);
/*
* Multi-Key get key.
*/
// Generate binary sortable key for current row in vectorized row batch.
keyVectorSerializeWrite.setOutput(currentKeyOutput);
keyVectorSerializeWrite.serializeWrite(batch, batchIndex);
boolean isAnyNull = keyVectorSerializeWrite.getHasAnyNulls();
// A new key series starts when the key has a NULL, when there is no saved key, or when
// the serialized bytes differ from the saved key.
if (isAnyNull || !haveSaveKey || !saveKeyOutput.arraysEquals(currentKeyOutput)) {
if (haveSaveKey) {
// Move on with our counts.
// Both MATCH and SPILL keep their hash map result slot (its index was recorded for the
// series or the spill), so the slot counter advances for both.
switch(saveJoinResult) {
case MATCH:
hashMapResultCount++;
equalKeySeriesCount++;
break;
case SPILL:
hashMapResultCount++;
break;
case NOMATCH:
break;
}
}
if (isAnyNull) {
// NULL keys are not looked up and are not remembered as the saved key.
saveJoinResult = JoinUtil.JoinResult.NOMATCH;
haveSaveKey = false;
} else {
// Regardless of our matching result, we keep that information to make multiple use
// of it for a possible series of equal keys.
haveSaveKey = true;
/*
* Multi-Key specific save key.
*/
// Swap the two buffers: the key just serialized becomes the saved key, and the old
// saved buffer is the one handed to setOutput for the next row.
temp = saveKeyOutput;
saveKeyOutput = currentKeyOutput;
currentKeyOutput = temp;
/*
* Multi-Key specific lookup key.
*/
byte[] keyBytes = saveKeyOutput.getData();
int keyLength = saveKeyOutput.getLength();
saveJoinResult = hashMap.lookup(keyBytes, 0, keyLength, hashMapResults[hashMapResultCount]);
}
// First row of the series: open the series bookkeeping for its join result.
switch(saveJoinResult) {
case MATCH:
equalKeySeriesHashMapResultIndices[equalKeySeriesCount] = hashMapResultCount;
equalKeySeriesAllMatchIndices[equalKeySeriesCount] = allMatchCount;
equalKeySeriesIsSingleValue[equalKeySeriesCount] = hashMapResults[hashMapResultCount].isSingleRow();
equalKeySeriesDuplicateCounts[equalKeySeriesCount] = 1;
allMatchs[allMatchCount++] = batchIndex;
// VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " MATCH isSingleValue " + equalKeySeriesIsSingleValue[equalKeySeriesCount] + " currentKey " + currentKey);
break;
case SPILL:
spills[spillCount] = batchIndex;
spillHashMapResultIndices[spillCount] = hashMapResultCount;
spillCount++;
break;
case NOMATCH:
// VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " NOMATCH" + " currentKey " + currentKey);
break;
}
} else {
// Same key as the previous row: reuse the saved join result without another lookup.
switch(saveJoinResult) {
case MATCH:
equalKeySeriesDuplicateCounts[equalKeySeriesCount]++;
allMatchs[allMatchCount++] = batchIndex;
// VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " MATCH duplicate");
break;
case SPILL:
spills[spillCount] = batchIndex;
spillHashMapResultIndices[spillCount] = hashMapResultCount;
spillCount++;
break;
case NOMATCH:
// VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " NOMATCH duplicate");
break;
}
}
}
if (haveSaveKey) {
// Update our counts for the last key.
switch(saveJoinResult) {
case MATCH:
hashMapResultCount++;
equalKeySeriesCount++;
break;
case SPILL:
hashMapResultCount++;
break;
case NOMATCH:
break;
}
}
if (LOG.isDebugEnabled()) {
LOG.debug(CLASS_NAME + " allMatchs " + intArrayToRangesString(allMatchs, allMatchCount) + " equalKeySeriesHashMapResultIndices " + intArrayToRangesString(equalKeySeriesHashMapResultIndices, equalKeySeriesCount) + " equalKeySeriesAllMatchIndices " + intArrayToRangesString(equalKeySeriesAllMatchIndices, equalKeySeriesCount) + " equalKeySeriesIsSingleValue " + Arrays.toString(Arrays.copyOfRange(equalKeySeriesIsSingleValue, 0, equalKeySeriesCount)) + " equalKeySeriesDuplicateCounts " + Arrays.toString(Arrays.copyOfRange(equalKeySeriesDuplicateCounts, 0, equalKeySeriesCount)) + " spills " + intArrayToRangesString(spills, spillCount) + " spillHashMapResultIndices " + intArrayToRangesString(spillHashMapResultIndices, spillCount) + " hashMapResults " + Arrays.toString(Arrays.copyOfRange(hashMapResults, 0, hashMapResultCount)));
}
// Hand the collected match / spill counts to the shared inner-join finish step.
finishInner(batch, allMatchCount, equalKeySeriesCount, spillCount, hashMapResultCount);
}
if (batch.size > 0) {
// Forward any remaining selected rows.
forwardBigTableBatch(batch);
}
} catch (IOException e) {
// Wrapped exactly like the general case below.
throw new HiveException(e);
} catch (Exception e) {
throw new HiveException(e);
}
}
use of org.apache.hadoop.hive.serde2.ByteStream.Output in project hive by apache.
the class VectorMapJoinLeftSemiMultiKeyOperator method process.
// ---------------------------------------------------------------------------
// Process Multi-Key Left-Semi Join on a vectorized row batch.
//
@Override
public void process(Object row, int tag) throws HiveException {
try {
VectorizedRowBatch batch = (VectorizedRowBatch) row;
alias = (byte) tag;
if (needCommonSetup) {
// Our one time process method initialization.
commonSetup(batch);
/*
* Initialize Multi-Key members for this specialized class.
*/
keyVectorSerializeWrite = new VectorSerializeRow(new BinarySortableSerializeWrite(bigTableKeyColumnMap.length));
keyVectorSerializeWrite.init(bigTableKeyTypeInfos, bigTableKeyColumnMap);
currentKeyOutput = new Output();
saveKeyOutput = new Output();
needCommonSetup = false;
}
if (needHashTableSetup) {
// Setup our hash table specialization. It will be the first time the process
// method is called, or after a Hybrid Grace reload.
/*
* Get our Multi-Key hash set information for this specialized class.
*/
hashSet = (VectorMapJoinBytesHashSet) vectorMapJoinHashTable;
needHashTableSetup = false;
}
batchCounter++;
// For left semi joins, we may apply the filter(s) now.
for (VectorExpression ve : bigTableFilterExpressions) {
ve.evaluate(batch);
}
final int inputLogicalSize = batch.size;
if (inputLogicalSize == 0) {
if (LOG.isDebugEnabled()) {
LOG.debug(CLASS_NAME + " batch #" + batchCounter + " empty");
}
return;
}
// Perform any key expressions. Results will go into scratch columns.
if (bigTableKeyExpressions != null) {
for (VectorExpression ve : bigTableKeyExpressions) {
ve.evaluate(batch);
}
}
/*
* Multi-Key specific declarations.
*/
// None.
/*
* Multi-Key Long check for repeating.
*/
// If all BigTable input columns to key expressions are isRepeating, then
// calculate key once; lookup once.
boolean allKeyInputColumnsRepeating;
if (bigTableKeyColumnMap.length == 0) {
allKeyInputColumnsRepeating = false;
} else {
allKeyInputColumnsRepeating = true;
for (int i = 0; i < bigTableKeyColumnMap.length; i++) {
if (!batch.cols[bigTableKeyColumnMap[i]].isRepeating) {
allKeyInputColumnsRepeating = false;
break;
}
}
}
if (allKeyInputColumnsRepeating) {
/*
* Repeating.
*/
// All key input columns are repeating. Generate key once. Lookup once.
// Since the key is repeated, we must use entry 0 regardless of selectedInUse.
/*
* Multi-Key specific repeated lookup.
*/
keyVectorSerializeWrite.setOutput(currentKeyOutput);
keyVectorSerializeWrite.serializeWrite(batch, 0);
JoinUtil.JoinResult joinResult;
if (keyVectorSerializeWrite.getHasAnyNulls()) {
joinResult = JoinUtil.JoinResult.NOMATCH;
} else {
byte[] keyBytes = currentKeyOutput.getData();
int keyLength = currentKeyOutput.getLength();
// LOG.debug(CLASS_NAME + " processOp all " + displayBytes(keyBytes, 0, keyLength));
joinResult = hashSet.contains(keyBytes, 0, keyLength, hashSetResults[0]);
}
if (LOG.isDebugEnabled()) {
LOG.debug(CLASS_NAME + " batch #" + batchCounter + " repeated joinResult " + joinResult.name());
}
finishLeftSemiRepeated(batch, joinResult, hashSetResults[0]);
} else {
if (LOG.isDebugEnabled()) {
LOG.debug(CLASS_NAME + " batch #" + batchCounter + " non-repeated");
}
// We remember any matching rows in matchs / matchSize. At the end of the loop,
// selected / batch.size will represent both matching and non-matching rows for outer join.
// Only deferred rows will have been removed from selected.
int[] selected = batch.selected;
boolean selectedInUse = batch.selectedInUse;
int hashSetResultCount = 0;
int allMatchCount = 0;
int spillCount = 0;
/*
* Multi-Key specific variables.
*/
Output temp;
// We optimize performance by only looking up the first key in a series of equal keys.
boolean haveSaveKey = false;
JoinUtil.JoinResult saveJoinResult = JoinUtil.JoinResult.NOMATCH;
// Logical loop over the rows in the batch since the batch may have selected in use.
for (int logical = 0; logical < inputLogicalSize; logical++) {
int batchIndex = (selectedInUse ? selected[logical] : logical);
/*
* Multi-Key get key.
*/
// Generate binary sortable key for current row in vectorized row batch.
keyVectorSerializeWrite.setOutput(currentKeyOutput);
keyVectorSerializeWrite.serializeWrite(batch, batchIndex);
boolean isAnyNull = keyVectorSerializeWrite.getHasAnyNulls();
if (isAnyNull || !haveSaveKey || !saveKeyOutput.arraysEquals(currentKeyOutput)) {
if (haveSaveKey) {
// Move on with our counts.
switch(saveJoinResult) {
case MATCH:
// We have extracted the existence from the hash set result, so we don't keep it.
break;
case SPILL:
// We keep the hash set result for its spill information.
hashSetResultCount++;
break;
case NOMATCH:
break;
}
}
if (isAnyNull) {
saveJoinResult = JoinUtil.JoinResult.NOMATCH;
haveSaveKey = false;
} else {
// Regardless of our matching result, we keep that information to make multiple use
// of it for a possible series of equal keys.
haveSaveKey = true;
/*
* Multi-Key specific save key and lookup.
*/
temp = saveKeyOutput;
saveKeyOutput = currentKeyOutput;
currentKeyOutput = temp;
/*
* Multi-key specific lookup key.
*/
byte[] keyBytes = saveKeyOutput.getData();
int keyLength = saveKeyOutput.getLength();
saveJoinResult = hashSet.contains(keyBytes, 0, keyLength, hashSetResults[hashSetResultCount]);
}
switch(saveJoinResult) {
case MATCH:
allMatchs[allMatchCount++] = batchIndex;
// VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " MATCH isSingleValue " + equalKeySeriesIsSingleValue[equalKeySeriesCount] + " currentKey " + currentKey);
break;
case SPILL:
spills[spillCount] = batchIndex;
spillHashMapResultIndices[spillCount] = hashSetResultCount;
spillCount++;
break;
case NOMATCH:
// VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " NOMATCH" + " currentKey " + currentKey);
break;
}
} else {
switch(saveJoinResult) {
case MATCH:
allMatchs[allMatchCount++] = batchIndex;
// VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " MATCH duplicate");
break;
case SPILL:
spills[spillCount] = batchIndex;
spillHashMapResultIndices[spillCount] = hashSetResultCount;
spillCount++;
break;
case NOMATCH:
// VectorizedBatchUtil.debugDisplayOneRow(batch, batchIndex, CLASS_NAME + " NOMATCH duplicate");
break;
}
}
}
if (haveSaveKey) {
// Update our counts for the last key.
switch(saveJoinResult) {
case MATCH:
// We have extracted the existence from the hash set result, so we don't keep it.
break;
case SPILL:
// We keep the hash set result for its spill information.
hashSetResultCount++;
break;
case NOMATCH:
break;
}
}
if (LOG.isDebugEnabled()) {
LOG.debug(CLASS_NAME + " allMatchs " + intArrayToRangesString(allMatchs, allMatchCount) + " spills " + intArrayToRangesString(spills, spillCount) + " spillHashMapResultIndices " + intArrayToRangesString(spillHashMapResultIndices, spillCount) + " hashMapResults " + Arrays.toString(Arrays.copyOfRange(hashSetResults, 0, hashSetResultCount)));
}
finishLeftSemi(batch, allMatchCount, spillCount, (VectorMapJoinHashTableResult[]) hashSetResults);
}
if (batch.size > 0) {
// Forward any remaining selected rows.
forwardBigTableBatch(batch);
}
} catch (IOException e) {
throw new HiveException(e);
} catch (Exception e) {
throw new HiveException(e);
}
}
use of org.apache.hadoop.hive.serde2.ByteStream.Output in project flink by apache.
the class HiveParserCalcitePlanner method genSelectForWindowing.
/**
 * Builds the select rel node that carries the window expressions of a query block.
 *
 * <p>All projections obtained from below {@code srcRel} are kept. For every window expression
 * that the output row resolver does not yet contain, one more projection is appended and a
 * matching {@link ColumnInfo} is registered in the resolver and added to {@code newColumns}.
 *
 * @param qb query block; only the first windowing spec returned by its iterator is used
 * @param srcRel relational node that feeds the new select
 * @param newColumns collects the column info of every projection appended here
 * @return the node produced by {@code genSelectRelNode}, or {@code null} when the query block
 *     has no windowing spec or the spec has no window expressions
 * @throws SemanticException propagated from the helpers invoked by this method
 */
private RelNode genSelectForWindowing(HiveParserQB qb, RelNode srcRel, HashSet<ColumnInfo> newColumns) throws SemanticException {
    // Nothing to do without a windowing spec.
    if (qb.getAllWindowingSpecs().isEmpty()) {
        return null;
    }
    HiveParserWindowingSpec windowingSpec = qb.getAllWindowingSpecs().values().iterator().next();
    if (windowingSpec == null) {
        return null;
    }

    // Validate the spec before reading its window expressions.
    windowingSpec.validateAndMakeEffective();
    List<HiveParserWindowingSpec.WindowExpressionSpec> windowExprSpecs = windowingSpec.getWindowExpressions();
    if (windowExprSpecs == null || windowExprSpecs.isEmpty()) {
        return null;
    }

    HiveParserRowResolver inputResolver = relToRowResolver.get(srcRel);

    // Start from the projections of the input, referenced as input refs.
    List<RexNode> projections = new ArrayList<>(HiveParserUtils.getProjsFromBelowAsInputRef(srcRel));

    // The output row resolver begins as a copy of the input one.
    HiveParserRowResolver outputResolver = new HiveParserRowResolver();
    boolean addedCleanly = HiveParserRowResolver.add(outputResolver, inputResolver);
    if (!addedCleanly) {
        LOG.warn("Duplicates detected when adding columns to RR: see previous message");
    }

    // Values shared by every iteration of the loop below.
    final HiveParserQBParseInfo parseInfo = qb.getParseInfo();
    final String clauseName = parseInfo.getClauseNames().iterator().next();
    final boolean noCubeRollupGrpSet =
            parseInfo.getDestRollups().isEmpty()
                    && parseInfo.getDestGroupingSets().isEmpty()
                    && parseInfo.getDestCubes().isEmpty();

    // Append a projection and an output column for each window expression not yet resolved.
    for (HiveParserWindowingSpec.WindowExpressionSpec exprSpec : windowExprSpecs) {
        if (!parseInfo.getDestToGroupBy().isEmpty()) {
            // Grouping functions get rewritten when a group-by is present.
            exprSpec.setExpression(
                    rewriteGroupingFunctionAST(
                            getGroupByForClause(parseInfo, clauseName),
                            exprSpec.getExpression(),
                            noCubeRollupGrpSet));
        }
        if (outputResolver.getExpression(exprSpec.getExpression()) != null) {
            // Already present in the output row resolver; no new projection needed.
            continue;
        }
        Pair<RexNode, TypeInfo> rexWithType = getWindowRexAndType(exprSpec, srcRel);
        projections.add(rexWithType.getKey());
        // The internal column name is derived from the projection count after the append.
        String internalName = getColumnInternalName(projections.size());
        ColumnInfo windowColumn = new ColumnInfo(internalName, rexWithType.getValue(), null, false);
        outputResolver.putExpression(exprSpec.getExpression(), windowColumn);
        newColumns.add(windowColumn);
    }

    return genSelectRelNode(projections, outputResolver, srcRel, windowExprSpecs);
}
Aggregations