Use of org.apache.hadoop.hive.ql.plan.JoinCondDesc in project hive by apache.
From the class MapJoinProcessor, method getMapJoinDesc.
public static MapJoinDesc getMapJoinDesc(HiveConf hconf, JoinOperator op, boolean leftInputJoin,
    String[] baseSrc, List<String> mapAliases, int mapJoinPos, boolean noCheckOuterJoin,
    boolean adjustParentsChildren) throws SemanticException {
  JoinDesc desc = op.getConf();
  JoinCondDesc[] condns = desc.getConds();
  Byte[] tagOrder = desc.getTagOrder();
  // outer join cannot be performed on a table which is being cached
  if (!noCheckOuterJoin) {
    if (checkMapJoin(mapJoinPos, condns) < 0) {
      throw new SemanticException(ErrorMsg.NO_OUTER_MAPJOIN.getMsg());
    }
  }
  Map<String, ExprNodeDesc> colExprMap = op.getColumnExprMap();
  List<ColumnInfo> schema = new ArrayList<ColumnInfo>(op.getSchema().getSignature());
  Map<Byte, List<ExprNodeDesc>> valueExprs = op.getConf().getExprs();
  Map<Byte, List<ExprNodeDesc>> newValueExprs = new HashMap<Byte, List<ExprNodeDesc>>();
  ObjectPair<List<ReduceSinkOperator>, Map<Byte, List<ExprNodeDesc>>> pair =
      getKeys(leftInputJoin, baseSrc, op);
  List<ReduceSinkOperator> oldReduceSinkParentOps = pair.getFirst();
  for (Map.Entry<Byte, List<ExprNodeDesc>> entry : valueExprs.entrySet()) {
    byte tag = entry.getKey();
    Operator<?> terminal = oldReduceSinkParentOps.get(tag);
    List<ExprNodeDesc> values = entry.getValue();
    List<ExprNodeDesc> newValues = ExprNodeDescUtils.backtrack(values, op, terminal);
    newValueExprs.put(tag, newValues);
    for (int i = 0; i < schema.size(); i++) {
      ColumnInfo column = schema.get(i);
      if (column == null) {
        continue;
      }
      ExprNodeDesc expr = colExprMap.get(column.getInternalName());
      int index = ExprNodeDescUtils.indexOf(expr, values);
      if (index >= 0) {
        colExprMap.put(column.getInternalName(), newValues.get(index));
        schema.set(i, null);
      }
    }
  }
  // rewrite value index for mapjoin
  Map<Byte, int[]> valueIndices = new HashMap<Byte, int[]>();
  // get the join keys from old parent ReduceSink operators
  Map<Byte, List<ExprNodeDesc>> keyExprMap = pair.getSecond();
  if (!adjustParentsChildren) {
    // Since we did not remove reduce sink parents, keep the original value expressions
    newValueExprs = valueExprs;
    // Join key exprs are represented in terms of the original table columns,
    // we need to convert these to the generated column names we can see in the Join operator
    Map<Byte, List<ExprNodeDesc>> newKeyExprMap = new HashMap<Byte, List<ExprNodeDesc>>();
    for (Map.Entry<Byte, List<ExprNodeDesc>> mapEntry : keyExprMap.entrySet()) {
      Byte pos = mapEntry.getKey();
      ReduceSinkOperator rsParent = oldReduceSinkParentOps.get(pos.byteValue());
      List<ExprNodeDesc> keyExprList =
          ExprNodeDescUtils.resolveJoinKeysAsRSColumns(mapEntry.getValue(), rsParent);
      if (keyExprList == null) {
        throw new SemanticException("Error resolving join keys");
      }
      newKeyExprMap.put(pos, keyExprList);
    }
    keyExprMap = newKeyExprMap;
  }
  // construct valueTableDescs and valueFilteredTableDescs
  List<TableDesc> valueTableDescs = new ArrayList<TableDesc>();
  List<TableDesc> valueFilteredTableDescs = new ArrayList<TableDesc>();
  int[][] filterMap = desc.getFilterMap();
  for (byte pos = 0; pos < op.getParentOperators().size(); pos++) {
    List<ExprNodeDesc> valueCols = newValueExprs.get(pos);
    if (pos != mapJoinPos) {
      // remove values in key exprs for value table schema
      // value expression for hashsink will be modified in
      // LocalMapJoinProcessor
      int[] valueIndex = new int[valueCols.size()];
      List<ExprNodeDesc> valueColsInValueExpr = new ArrayList<ExprNodeDesc>();
      for (int i = 0; i < valueIndex.length; i++) {
        ExprNodeDesc expr = valueCols.get(i);
        int kindex = ExprNodeDescUtils.indexOf(expr, keyExprMap.get(pos));
        if (kindex >= 0) {
          valueIndex[i] = kindex;
        } else {
          valueIndex[i] = -valueColsInValueExpr.size() - 1;
          valueColsInValueExpr.add(expr);
        }
      }
      if (needValueIndex(valueIndex)) {
        valueIndices.put(pos, valueIndex);
      }
      valueCols = valueColsInValueExpr;
    }
    // deep copy expr node desc
    List<ExprNodeDesc> valueFilteredCols = ExprNodeDescUtils.clone(valueCols);
    if (filterMap != null && filterMap[pos] != null && pos != mapJoinPos) {
      ExprNodeColumnDesc isFilterDesc = new ExprNodeColumnDesc(
          TypeInfoFactory.getPrimitiveTypeInfo(serdeConstants.SMALLINT_TYPE_NAME),
          "filter", "filter", false);
      valueFilteredCols.add(isFilterDesc);
    }
    TableDesc valueTableDesc = PlanUtils.getMapJoinValueTableDesc(
        PlanUtils.getFieldSchemasFromColumnList(valueCols, "mapjoinvalue"));
    TableDesc valueFilteredTableDesc = PlanUtils.getMapJoinValueTableDesc(
        PlanUtils.getFieldSchemasFromColumnList(valueFilteredCols, "mapjoinvalue"));
    valueTableDescs.add(valueTableDesc);
    valueFilteredTableDescs.add(valueFilteredTableDesc);
  }
  Map<Byte, List<ExprNodeDesc>> filters = desc.getFilters();
  Map<Byte, List<ExprNodeDesc>> newFilters = new HashMap<Byte, List<ExprNodeDesc>>();
  for (Map.Entry<Byte, List<ExprNodeDesc>> entry : filters.entrySet()) {
    byte srcTag = entry.getKey();
    List<ExprNodeDesc> filter = entry.getValue();
    Operator<?> terminal = oldReduceSinkParentOps.get(srcTag);
    newFilters.put(srcTag, ExprNodeDescUtils.backtrack(filter, op, terminal));
  }
  desc.setFilters(filters = newFilters);
  // create dumpfile prefix needed to create descriptor
  String dumpFilePrefix = "";
  if (mapAliases != null) {
    for (String mapAlias : mapAliases) {
      dumpFilePrefix = dumpFilePrefix + mapAlias;
    }
    dumpFilePrefix = dumpFilePrefix + "-" + PlanUtils.getCountForMapJoinDumpFilePrefix();
  } else {
    dumpFilePrefix = "mapfile" + PlanUtils.getCountForMapJoinDumpFilePrefix();
  }
  List<ExprNodeDesc> keyCols = keyExprMap.get((byte) mapJoinPos);
  List<String> outputColumnNames = op.getConf().getOutputColumnNames();
  TableDesc keyTableDesc = PlanUtils.getMapJoinKeyTableDesc(hconf,
      PlanUtils.getFieldSchemasFromColumnList(keyCols, MAPJOINKEY_FIELDPREFIX));
  JoinCondDesc[] joinCondns = op.getConf().getConds();
  MapJoinDesc mapJoinDescriptor = new MapJoinDesc(keyExprMap, keyTableDesc, newValueExprs,
      valueTableDescs, valueFilteredTableDescs, outputColumnNames, mapJoinPos, joinCondns,
      filters, op.getConf().getNoOuterJoin(), dumpFilePrefix);
  mapJoinDescriptor.setStatistics(op.getConf().getStatistics());
  mapJoinDescriptor.setTagOrder(tagOrder);
  mapJoinDescriptor.setNullSafes(desc.getNullSafes());
  mapJoinDescriptor.setFilterMap(desc.getFilterMap());
  mapJoinDescriptor.setResidualFilterExprs(desc.getResidualFilterExprs());
  if (!valueIndices.isEmpty()) {
    mapJoinDescriptor.setValueIndices(valueIndices);
  }
  return mapJoinDescriptor;
}
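The noCheckOuterJoin guard above relies on checkMapJoin to reject big-table positions that would break outer-join semantics: the preserved side of an outer join must be the streamed input, because a cached (small) input cannot emit its unmatched rows on its own. The sketch below illustrates that rule only; it is not Hive's actual checkMapJoin implementation, the class and helper names are invented for this example, and only the JoinCondDesc accessors and JoinDesc type constants used elsewhere on this page are assumed.

import org.apache.hadoop.hive.ql.plan.JoinCondDesc;
import org.apache.hadoop.hive.ql.plan.JoinDesc;

public class MapJoinPositionSketch {
  // Illustrative only: a candidate big-table position is acceptable when every outer
  // condition preserves exactly that side; other join types are rejected conservatively.
  static boolean canStreamPosition(int candidate, JoinCondDesc[] conds) {
    for (JoinCondDesc cond : conds) {
      int type = cond.getType();
      if (type == JoinDesc.INNER_JOIN) {
        continue;                                 // inner joins do not restrict the streamed position
      } else if (type == JoinDesc.LEFT_OUTER_JOIN) {
        if (candidate != cond.getLeft()) {
          return false;                           // the preserved (left) side must be the streamed input
        }
      } else if (type == JoinDesc.RIGHT_OUTER_JOIN) {
        if (candidate != cond.getRight()) {
          return false;                           // the preserved (right) side must be the streamed input
        }
      } else {
        return false;                             // e.g. FULL_OUTER_JOIN: no valid big-table position
      }
    }
    return true;
  }
}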
Use of org.apache.hadoop.hive.ql.plan.JoinCondDesc in project hive by apache.
From the class MapJoinProcessor, method convertMapJoin.
/**
 * convert a regular join to a map-side join.
 *
 * @param opParseCtxMap
 * @param op
 *          join operator
 * @param joinTree
 *          qb join tree
 * @param mapJoinPos
 *          position of the source to be read as part of map-reduce framework. All other
 *          sources are cached in memory
 * @param noCheckOuterJoin
 * @param validateMapJoinTree
 */
public MapJoinOperator convertMapJoin(HiveConf conf, JoinOperator op, boolean leftInputJoin,
    String[] baseSrc, List<String> mapAliases, int mapJoinPos, boolean noCheckOuterJoin,
    boolean validateMapJoinTree) throws SemanticException {
  // outer join cannot be performed on a table which is being cached
  JoinDesc desc = op.getConf();
  JoinCondDesc[] condns = desc.getConds();
  if (!noCheckOuterJoin) {
    if (checkMapJoin(mapJoinPos, condns) < 0) {
      throw new SemanticException(ErrorMsg.NO_OUTER_MAPJOIN.getMsg());
    }
  }
  // Walk over all the sources (which are guaranteed to be reduce sink
  // operators).
  // The join outputs a concatenation of all the inputs.
  List<Operator<? extends OperatorDesc>> parentOps = op.getParentOperators();
  List<Operator<? extends OperatorDesc>> newParentOps =
      new ArrayList<Operator<? extends OperatorDesc>>();
  List<Operator<? extends OperatorDesc>> oldReduceSinkParentOps =
      new ArrayList<Operator<? extends OperatorDesc>>();
  // found a source which is not to be stored in memory
  if (leftInputJoin) {
    // assert mapJoinPos == 0;
    Operator<? extends OperatorDesc> parentOp = parentOps.get(0);
    assert parentOp.getParentOperators().size() == 1;
    Operator<? extends OperatorDesc> grandParentOp = parentOp.getParentOperators().get(0);
    oldReduceSinkParentOps.add(parentOp);
    newParentOps.add(grandParentOp);
  }
  byte pos = 0;
  // Remove parent reduce-sink operators
  for (String src : baseSrc) {
    if (src != null) {
      Operator<? extends OperatorDesc> parentOp = parentOps.get(pos);
      assert parentOp.getParentOperators().size() == 1;
      Operator<? extends OperatorDesc> grandParentOp = parentOp.getParentOperators().get(0);
      oldReduceSinkParentOps.add(parentOp);
      newParentOps.add(grandParentOp);
    }
    pos++;
  }
  // create the map-join operator
  MapJoinOperator mapJoinOp = convertJoinOpMapJoinOp(conf, op, leftInputJoin, baseSrc,
      mapAliases, mapJoinPos, noCheckOuterJoin);
  // remove old parents
  for (pos = 0; pos < newParentOps.size(); pos++) {
    newParentOps.get(pos).replaceChild(oldReduceSinkParentOps.get(pos), mapJoinOp);
  }
  mapJoinOp.getParentOperators().removeAll(oldReduceSinkParentOps);
  mapJoinOp.setParentOperators(newParentOps);
  // make sure only map-joins can be performed.
  if (validateMapJoinTree) {
    validateMapJoinTypes(mapJoinOp);
  }
  return mapJoinOp;
}
Use of org.apache.hadoop.hive.ql.plan.JoinCondDesc in project hive by apache.
From the class MapJoinOperator, method canSkipJoinProcessing.
// If the loaded hash table is empty, for some conditions we can skip processing the big table rows.
protected boolean canSkipJoinProcessing(ExecMapperContext mapContext) {
  if (!canSkipReload(mapContext)) {
    return false;
  }
  JoinCondDesc[] joinConds = getConf().getConds();
  if (joinConds.length > 0) {
    for (JoinCondDesc joinCond : joinConds) {
      if (joinCond.getType() != JoinDesc.INNER_JOIN) {
        return false;
      }
    }
  } else {
    return false;
  }
  boolean skipJoinProcessing = false;
  for (int idx = 0; idx < mapJoinTables.length; ++idx) {
    if (idx == getConf().getPosBigTable()) {
      continue;
    }
    MapJoinTableContainer mapJoinTable = mapJoinTables[idx];
    if (mapJoinTable.size() == 0) {
      // If any table is empty, an inner join involving the tables should yield 0 rows.
      LOG.info("Hash table number " + idx + " is empty");
      skipJoinProcessing = true;
      break;
    }
  }
  return skipJoinProcessing;
}
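As a standalone illustration of the same guard, the sketch below builds two JoinCondDesc instances the way HiveOpConverter does (via org.apache.hadoop.hive.ql.parse.JoinCond and JoinType) and checks whether every condition is an inner join. The class and helper names are invented for the example, and it assumes JoinType.INNER exists alongside the LEFTOUTER, RIGHTOUTER, FULLOUTER, and LEFTSEMI values used elsewhere on this page.

import org.apache.hadoop.hive.ql.parse.JoinCond;
import org.apache.hadoop.hive.ql.parse.JoinType;
import org.apache.hadoop.hive.ql.plan.JoinCondDesc;
import org.apache.hadoop.hive.ql.plan.JoinDesc;

public class JoinCondInspection {
  // Mirrors the guard in canSkipJoinProcessing: only an all-inner-join condition list
  // lets an empty small-table hash map prove that the big-table rows can be skipped.
  static boolean allInnerJoins(JoinCondDesc[] conds) {
    if (conds.length == 0) {
      return false;
    }
    for (JoinCondDesc cond : conds) {
      if (cond.getType() != JoinDesc.INNER_JOIN) {
        return false;
      }
    }
    return true;
  }

  public static void main(String[] args) {
    JoinCondDesc inner = new JoinCondDesc(new JoinCond(0, 1, JoinType.INNER));
    JoinCondDesc leftOuter = new JoinCondDesc(new JoinCond(0, 1, JoinType.LEFTOUTER));
    System.out.println(allInnerJoins(new JoinCondDesc[] { inner }));            // true
    System.out.println(allInnerJoins(new JoinCondDesc[] { inner, leftOuter })); // false
  }
}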
Use of org.apache.hadoop.hive.ql.plan.JoinCondDesc in project hive by apache.
From the class HiveOpConverter, method genJoin.
private static JoinOperator genJoin(RelNode join, ExprNodeDesc[][] joinExpressions,
    List<List<ExprNodeDesc>> filterExpressions, List<Operator<?>> children, String[] baseSrc,
    String tabAlias) throws SemanticException {
  // 1. Extract join type
  JoinCondDesc[] joinCondns;
  boolean semiJoin;
  boolean noOuterJoin;
  if (join instanceof HiveMultiJoin) {
    HiveMultiJoin hmj = (HiveMultiJoin) join;
    joinCondns = new JoinCondDesc[hmj.getJoinInputs().size()];
    for (int i = 0; i < hmj.getJoinInputs().size(); i++) {
      joinCondns[i] = new JoinCondDesc(new JoinCond(hmj.getJoinInputs().get(i).left,
          hmj.getJoinInputs().get(i).right, transformJoinType(hmj.getJoinTypes().get(i))));
    }
    semiJoin = false;
    noOuterJoin = !hmj.isOuterJoin();
  } else {
    joinCondns = new JoinCondDesc[1];
    semiJoin = join instanceof SemiJoin;
    JoinType joinType;
    if (semiJoin) {
      joinType = JoinType.LEFTSEMI;
    } else {
      joinType = extractJoinType((Join) join);
    }
    joinCondns[0] = new JoinCondDesc(new JoinCond(0, 1, joinType));
    noOuterJoin = joinType != JoinType.FULLOUTER && joinType != JoinType.LEFTOUTER
        && joinType != JoinType.RIGHTOUTER;
  }
  // 2. We create the join aux structures
  ArrayList<ColumnInfo> outputColumns = new ArrayList<ColumnInfo>();
  ArrayList<String> outputColumnNames = new ArrayList<String>(join.getRowType().getFieldNames());
  Operator<?>[] childOps = new Operator[children.size()];
  Map<String, Byte> reversedExprs = new HashMap<String, Byte>();
  Map<Byte, List<ExprNodeDesc>> exprMap = new HashMap<Byte, List<ExprNodeDesc>>();
  Map<Byte, List<ExprNodeDesc>> filters = new HashMap<Byte, List<ExprNodeDesc>>();
  Map<String, ExprNodeDesc> colExprMap = new HashMap<String, ExprNodeDesc>();
  HashMap<Integer, Set<String>> posToAliasMap = new HashMap<Integer, Set<String>>();
  int outputPos = 0;
  for (int pos = 0; pos < children.size(); pos++) {
    // 2.1. Backtracking from RS
    ReduceSinkOperator inputRS = (ReduceSinkOperator) children.get(pos);
    if (inputRS.getNumParent() != 1) {
      throw new SemanticException("RS should have single parent");
    }
    Operator<?> parent = inputRS.getParentOperators().get(0);
    ReduceSinkDesc rsDesc = inputRS.getConf();
    int[] index = inputRS.getValueIndex();
    Byte tag = (byte) rsDesc.getTag();
    // 2.1.1. If semijoin...
    if (semiJoin && pos != 0) {
      exprMap.put(tag, new ArrayList<ExprNodeDesc>());
      childOps[pos] = inputRS;
      continue;
    }
    posToAliasMap.put(pos, new HashSet<String>(inputRS.getSchema().getTableNames()));
    List<String> keyColNames = rsDesc.getOutputKeyColumnNames();
    List<String> valColNames = rsDesc.getOutputValueColumnNames();
    Map<String, ExprNodeDesc> descriptors = buildBacktrackFromReduceSinkForJoin(outputPos,
        outputColumnNames, keyColNames, valColNames, index, parent, baseSrc[pos]);
    List<ColumnInfo> parentColumns = parent.getSchema().getSignature();
    for (int i = 0; i < index.length; i++) {
      ColumnInfo info = new ColumnInfo(parentColumns.get(i));
      info.setInternalName(outputColumnNames.get(outputPos));
      info.setTabAlias(tabAlias);
      outputColumns.add(info);
      reversedExprs.put(outputColumnNames.get(outputPos), tag);
      outputPos++;
    }
    exprMap.put(tag, new ArrayList<ExprNodeDesc>(descriptors.values()));
    colExprMap.putAll(descriptors);
    childOps[pos] = inputRS;
  }
  // 3. We populate the filters and filterMap structure needed in the join descriptor
  List<List<ExprNodeDesc>> filtersPerInput = Lists.newArrayList();
  int[][] filterMap = new int[children.size()][];
  for (int i = 0; i < children.size(); i++) {
    filtersPerInput.add(new ArrayList<ExprNodeDesc>());
  }
  // 3. We populate the filters structure
  for (int i = 0; i < filterExpressions.size(); i++) {
    int leftPos = joinCondns[i].getLeft();
    int rightPos = joinCondns[i].getRight();
    for (ExprNodeDesc expr : filterExpressions.get(i)) {
      // We need to update the exprNode, as currently
      // they refer to columns in the output of the join;
      // they should refer to the columns output by the RS
      int inputPos = updateExprNode(expr, reversedExprs, colExprMap);
      if (inputPos == -1) {
        inputPos = leftPos;
      }
      filtersPerInput.get(inputPos).add(expr);
      if (joinCondns[i].getType() == JoinDesc.FULL_OUTER_JOIN
          || joinCondns[i].getType() == JoinDesc.LEFT_OUTER_JOIN
          || joinCondns[i].getType() == JoinDesc.RIGHT_OUTER_JOIN) {
        if (inputPos == leftPos) {
          updateFilterMap(filterMap, leftPos, rightPos);
        } else {
          updateFilterMap(filterMap, rightPos, leftPos);
        }
      }
    }
  }
  for (int pos = 0; pos < children.size(); pos++) {
    ReduceSinkOperator inputRS = (ReduceSinkOperator) children.get(pos);
    ReduceSinkDesc rsDesc = inputRS.getConf();
    Byte tag = (byte) rsDesc.getTag();
    filters.put(tag, filtersPerInput.get(pos));
  }
  // 4. We create the join operator with its descriptor
  JoinDesc desc = new JoinDesc(exprMap, outputColumnNames, noOuterJoin, joinCondns, filters,
      joinExpressions);
  desc.setReversedExprs(reversedExprs);
  desc.setFilterMap(filterMap);
  JoinOperator joinOp = (JoinOperator) OperatorFactory.getAndMakeChild(
      childOps[0].getCompilationOpContext(), desc, new RowSchema(outputColumns), childOps);
  joinOp.setColumnExprMap(colExprMap);
  joinOp.setPosToAliasMap(posToAliasMap);
  joinOp.getConf().setBaseSrc(baseSrc);
  if (LOG.isDebugEnabled()) {
    LOG.debug("Generated " + joinOp + " with row schema: [" + joinOp.getSchema() + "]");
  }
  return joinOp;
}
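For a plain two-input join, genJoin reduces the logical join type to a single JoinCondDesc over positions 0 and 1 and derives the noOuterJoin flag from it. Below is a minimal sketch of just that step, using the same JoinCond and JoinType constructors as the snippet above; the class and helper names are invented for this example.

import org.apache.hadoop.hive.ql.parse.JoinCond;
import org.apache.hadoop.hive.ql.parse.JoinType;
import org.apache.hadoop.hive.ql.plan.JoinCondDesc;

public class TwoWayJoinCond {
  // One condition between input 0 (left) and input 1 (right), as in the else-branch above.
  static JoinCondDesc[] singleCondition(JoinType joinType) {
    return new JoinCondDesc[] { new JoinCondDesc(new JoinCond(0, 1, joinType)) };
  }

  // noOuterJoin holds only when no side is preserved, matching the flag computed above.
  static boolean isNoOuterJoin(JoinType joinType) {
    return joinType != JoinType.FULLOUTER
        && joinType != JoinType.LEFTOUTER
        && joinType != JoinType.RIGHTOUTER;
  }
}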
Use of org.apache.hadoop.hive.ql.plan.JoinCondDesc in project hive by apache.
From the class CommonJoinOperator, method genObject.
// creates objects in recursive manner
private void genObject(int aliasNum, boolean allLeftFirst, boolean allLeftNull) throws HiveException {
  JoinCondDesc joinCond = condn[aliasNum - 1];
  int type = joinCond.getType();
  int left = joinCond.getLeft();
  int right = joinCond.getRight();
  if (needsPostEvaluation && aliasNum == numAliases - 2) {
    int nextType = condn[aliasNum].getType();
    if (nextType == JoinDesc.RIGHT_OUTER_JOIN || nextType == JoinDesc.FULL_OUTER_JOIN) {
      // Initialize container to use for storing tuples before emitting them
      rowContainerPostFilteredOuterJoin = new HashMap<>();
    }
  }
  boolean[] skip = skipVectors[aliasNum];
  boolean[] prevSkip = skipVectors[aliasNum - 1];
  // search for match in the rhs table
  AbstractRowContainer<List<Object>> aliasRes = storage[order[aliasNum]];
  boolean needToProduceLeftRow = false;
  boolean producedRow = false;
  boolean done = false;
  boolean loopAgain = false;
  boolean tryLOForFO = type == JoinDesc.FULL_OUTER_JOIN;
  boolean rightFirst = true;
  AbstractRowContainer.RowIterator<List<Object>> iter = aliasRes.rowIter();
  int pos = 0;
  for (List<Object> rightObj = iter.first(); !done && rightObj != null;
      rightObj = loopAgain ? rightObj : iter.next(), rightFirst = loopAgain = false, pos++) {
    System.arraycopy(prevSkip, 0, skip, 0, prevSkip.length);
    boolean rightNull = rightObj == dummyObj[aliasNum];
    if (hasFilter(order[aliasNum])) {
      filterTags[aliasNum] = getFilterTag(rightObj);
    }
    skip[right] = rightNull;
    if (type == JoinDesc.INNER_JOIN) {
      innerJoin(skip, left, right);
    } else if (type == JoinDesc.LEFT_SEMI_JOIN) {
      if (innerJoin(skip, left, right)) {
        // if left-semi-join found a match, skipping the rest of the rows in the
        // rhs table of the semijoin
        done = true;
      }
    } else if (type == JoinDesc.LEFT_OUTER_JOIN
        || (type == JoinDesc.FULL_OUTER_JOIN && rightNull)) {
      int result = leftOuterJoin(skip, left, right);
      if (result < 0) {
        continue;
      }
      done = result > 0;
    } else if (type == JoinDesc.RIGHT_OUTER_JOIN
        || (type == JoinDesc.FULL_OUTER_JOIN && allLeftNull)) {
      if (allLeftFirst && !rightOuterJoin(skip, left, right)
          || !allLeftFirst && !innerJoin(skip, left, right)) {
        continue;
      }
    } else if (type == JoinDesc.FULL_OUTER_JOIN) {
      if (tryLOForFO && leftOuterJoin(skip, left, right) > 0) {
        loopAgain = allLeftFirst;
        done = !loopAgain;
        tryLOForFO = false;
      } else if (allLeftFirst && !rightOuterJoin(skip, left, right)
          || !allLeftFirst && !innerJoin(skip, left, right)) {
        continue;
      }
    }
    intermediate[aliasNum] = rightObj;
    if (aliasNum == numAliases - 1) {
      if (!(allLeftNull && rightNull)) {
        needToProduceLeftRow = true;
        if (needsPostEvaluation) {
          // This is only executed for outer joins with residual filters
          boolean forward = createForwardJoinObject(skipVectors[numAliases - 1]);
          producedRow |= forward;
          if (!rightNull
              && (type == JoinDesc.RIGHT_OUTER_JOIN || type == JoinDesc.FULL_OUTER_JOIN)) {
            if (forward) {
              // This record produced a result this time, remove it from the storage
              // as it will not need to produce a result with NULL values anymore
              rowContainerPostFilteredOuterJoin.put(pos, null);
            } else {
              // we should produce a result
              if (!rowContainerPostFilteredOuterJoin.containsKey(pos)) {
                Object[] row = Arrays.copyOfRange(forwardCache, offsets[aliasNum],
                    offsets[aliasNum + 1]);
                rowContainerPostFilteredOuterJoin.put(pos, row);
              }
            }
          }
        } else {
          createForwardJoinObject(skipVectors[numAliases - 1]);
        }
      }
    } else {
      // recursively call the join the other rhs tables
      genObject(aliasNum + 1, allLeftFirst && rightFirst, allLeftNull && rightNull);
    }
  }
  // Consolidation for outer joins
  if (needsPostEvaluation && aliasNum == numAliases - 1 && needToProduceLeftRow
      && !producedRow && !allLeftNull) {
    if (type == JoinDesc.LEFT_OUTER_JOIN || type == JoinDesc.FULL_OUTER_JOIN) {
      // If it is a LEFT / FULL OUTER JOIN and the left record did not produce
      // results, we need to take that record, replace the right side with NULL
      // values, and produce the records
      int i = numAliases - 1;
      for (int j = offsets[i]; j < offsets[i + 1]; j++) {
        forwardCache[j] = null;
      }
      internalForward(forwardCache, outputObjInspector);
      countAfterReport = 0;
    }
  } else if (needsPostEvaluation && aliasNum == numAliases - 2) {
    int nextType = condn[aliasNum].getType();
    if (nextType == JoinDesc.RIGHT_OUTER_JOIN || nextType == JoinDesc.FULL_OUTER_JOIN) {
      // If it is a RIGHT / FULL OUTER JOIN, we need to iterate through the row container
      // that contains all the right records that did not produce results. Then, for each
      // of those records, we replace the left side with NULL values, and produce the
      // records.
      // Observe that we only enter this block when we have finished iterating through
      // all the left and right records (aliasNum == numAliases - 2), and thus, we have
      // tried to evaluate the post-filter condition on every possible combination.
      Arrays.fill(forwardCache, null);
      for (Object[] row : rowContainerPostFilteredOuterJoin.values()) {
        if (row == null) {
          continue;
        }
        System.arraycopy(row, 0, forwardCache, offsets[numAliases - 1], row.length);
        internalForward(forwardCache, outputObjInspector);
        countAfterReport = 0;
      }
    }
  }
}
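In CommonJoinOperator, condn holds one JoinCondDesc per join step: condition i - 1 connects the join of the first i inputs with input i, and getType() selects the per-row strategy inside genObject. The helper below is not part of Hive; it is a hedged sketch that renders such a chain as readable text, assuming only the getType(), getLeft(), and getRight() accessors and the JoinDesc type constants used above. The class and method names are invented for the example.

import org.apache.hadoop.hive.ql.plan.JoinCondDesc;
import org.apache.hadoop.hive.ql.plan.JoinDesc;

public class JoinChainPrinter {
  // Renders a JoinCondDesc chain the way CommonJoinOperator consumes it; purely illustrative.
  static String describe(JoinCondDesc[] conds, String[] aliases) {
    StringBuilder sb = new StringBuilder(aliases[0]);
    for (int i = 1; i <= conds.length; i++) {
      JoinCondDesc cond = conds[i - 1];
      String type;
      switch (cond.getType()) {
        case JoinDesc.INNER_JOIN:       type = "INNER JOIN";       break;
        case JoinDesc.LEFT_OUTER_JOIN:  type = "LEFT OUTER JOIN";  break;
        case JoinDesc.RIGHT_OUTER_JOIN: type = "RIGHT OUTER JOIN"; break;
        case JoinDesc.FULL_OUTER_JOIN:  type = "FULL OUTER JOIN";  break;
        case JoinDesc.LEFT_SEMI_JOIN:   type = "LEFT SEMI JOIN";   break;
        default:                        type = "JOIN";             break;
      }
      sb.append(' ').append(type).append(' ').append(aliases[i])
          .append(" (left=").append(cond.getLeft())
          .append(", right=").append(cond.getRight()).append(')');
    }
    return sb.toString();
  }
}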