Use of org.apache.hadoop.hive.ql.exec.UnionOperator in the Apache Hive project.
Class GenMRFileSink1, method processFS.
/**
 * Handles a FileSink operator: records it as seen, asks GenMapRedUtils to create the
 * move task (if one is needed) and wires the current task into the plan.
 *
 * @param fsOp
 *          current FileSink operator
 * @param stack
 *          parent operators (not read by this method)
 * @param opProcCtx
 *          processing context; must be a GenMRProcContext
 * @param chDir
 *          whether the operator should first write to a tmp dir that is merged
 *          into the final dir later
 * @return the path returned by GenMapRedUtils.createMoveTask, i.e. the location the
 *         FileSinkOperator should store to
 * @throws SemanticException
 */
private Path processFS(FileSinkOperator fsOp, Stack<Node> stack, NodeProcessorCtx opProcCtx, boolean chDir) throws SemanticException {
  final GenMRProcContext genCtx = (GenMRProcContext) opProcCtx;
  final Task<? extends Serializable> activeTask = genCtx.getCurrTask();

  // Remember this file sink in the context, creating the list on first use.
  List<FileSinkOperator> visitedSinks = genCtx.getSeenFileSinkOps();
  if (visitedSinks == null) {
    visitedSinks = new ArrayList<FileSinkOperator>();
  }
  if (!visitedSinks.contains(fsOp)) {
    visitedSinks.add(fsOp);
  }
  genCtx.setSeenFileSinkOps(visitedSinks);

  // If the directory needs to be changed, the new directory comes back from here.
  final Path target = GenMapRedUtils.createMoveTask(genCtx.getCurrTask(), chDir, fsOp, genCtx.getParseCtx(), genCtx.getMvTask(), genCtx.getConf(), genCtx.getDependencyTaskForMultiInsert());

  final TableScanOperator topOp = genCtx.getCurrTopOp();
  final String aliasId = genCtx.getCurrAliasId();
  final HashMap<Operator<? extends OperatorDesc>, Task<? extends Serializable>> taskByOp = genCtx.getOpTaskMap();

  if (topOp == null) {
    // No current top operator: only a pending union needs to be initialized.
    final UnionOperator pendingUnion = genCtx.getCurrUnionOp();
    if (pendingUnion != null) {
      taskByOp.put(null, activeTask);
      GenMapRedUtils.initUnionPlan(genCtx, pendingUnion, activeTask, false);
    }
    return target;
  }

  // Map-only job: the task still needs to be processed.
  final Task<? extends Serializable> registeredMapTask = taskByOp.get(null);
  final boolean alreadyPlanned = genCtx.isSeenOp(activeTask, topOp);

  if (registeredMapTask == null) {
    if (!alreadyPlanned) {
      GenMapRedUtils.setTaskPlan(aliasId, topOp, activeTask, false, genCtx);
    }
    taskByOp.put(null, activeTask);
  } else if (!alreadyPlanned) {
    GenMapRedUtils.setTaskPlan(aliasId, topOp, registeredMapTask, false, genCtx);
  } else {
    final UnionOperator pendingUnion = genCtx.getCurrUnionOp();
    if (pendingUnion != null) {
      taskByOp.put(null, activeTask);
      genCtx.setCurrTopOp(null);
      GenMapRedUtils.initUnionPlan(genCtx, pendingUnion, activeTask, false);
    }
  }
  // registeredMapTask and activeTask are expected to have been merged by a join/union
  // processor (e.g., GenMRUnion1) that has multiple top operators.
  return target;
}
Use of org.apache.hadoop.hive.ql.exec.UnionOperator in the Apache Hive project.
Class GenMRRedSink3, method process.
/**
 * Reduce sink encountered below a union operator found on the walker stack.
 *
 * Selects the task associated with the union (or the context's current task when the
 * union has no recorded context), then initializes, splits or joins plans depending on
 * whether the reducer already has a task.
 *
 * @param nd
 *          the reduce sink operator encountered
 * @param stack
 *          operators walked so far; searched for the enclosing UnionOperator
 * @param opProcCtx
 *          context; must be a GenMRProcContext
 * @param nodeOutputs
 *          not read by this method
 * @return always {@code true}
 */
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx opProcCtx, Object... nodeOutputs) throws SemanticException {
  final ReduceSinkOperator reduceSink = (ReduceSinkOperator) nd;
  final GenMRProcContext genCtx = (GenMRProcContext) opProcCtx;

  // The union consists of a bunch of map-reduce jobs and has been split at the union.
  final Operator<? extends OperatorDesc> reducer = reduceSink.getChildOperators().get(0);
  final UnionOperator union = Utils.findNode(stack, UnionOperator.class);
  assert union != null;

  final Map<Operator<? extends OperatorDesc>, GenMapRedCtx> ctxByOp = genCtx.getMapCurrCtx();
  final GenMapRedCtx unionCtx = ctxByOp.get(union);
  final Task<? extends Serializable> unionTask = (unionCtx != null) ? unionCtx.getCurrTask() : genCtx.getCurrTask();

  final MapredWork unionPlan = (MapredWork) unionTask.getWork();
  final HashMap<Operator<? extends OperatorDesc>, Task<? extends Serializable>> taskByOp = genCtx.getOpTaskMap();
  final Task<? extends Serializable> reducerTask = taskByOp.get(reducer);
  genCtx.setCurrTask(unionTask);

  if (reducerTask == null) {
    // No plan exists for this reducer yet.
    if (unionPlan.getReduceWork() == null) {
      // The reducer is encountered for the first time.
      GenMapRedUtils.initUnionPlan(reduceSink, union, genCtx, unionTask);
    } else {
      // The union is followed by a multi-table insert.
      GenMapRedUtils.splitPlan(reduceSink, genCtx);
    }
  } else {
    final boolean planAlreadyTargetsReducer = unionPlan.getReduceWork() != null && unionPlan.getReduceWork().getReducer() == reducer;
    if (planAlreadyTargetsReducer) {
      // The union is already initialized but is being walked from another input;
      // initUnionPlan is expected to be idempotent, so it is simply called again.
      GenMapRedUtils.initUnionPlan(reduceSink, union, genCtx, unionTask);
    } else {
      GenMapRedUtils.joinUnionPlan(genCtx, union, unionTask, reducerTask, false);
      genCtx.setCurrTask(reducerTask);
    }
  }

  ctxByOp.put(reduceSink, new GenMapRedCtx(genCtx.getCurrTask(), genCtx.getCurrAliasId()));
  // The union operator has been processed.
  genCtx.setCurrUnionOp(null);
  return true;
}
Use of org.apache.hadoop.hive.ql.exec.UnionOperator in the Apache Hive project.
Class ColumnPrunerProcCtx, method handleFilterUnionChildren.
/**
 * If the input filter operator has direct child(ren) which are union operators, and the
 * number of pruned columns of the filter differs from that of the union, create a select
 * operator between them. The select operator projects exactly the columns of the union's
 * schema that appear in the union's pruned column list.
 *
 * Nothing is done when {@code curOp} is not a FilterOperator, has no children, or has no
 * (or an empty) pruned column list.
 *
 * @param curOp
 *          The filter operator whose children need to be handled.
 * @throws SemanticException
 */
public void handleFilterUnionChildren(Operator<? extends OperatorDesc> curOp) throws SemanticException {
  if (curOp.getChildOperators() == null || !(curOp instanceof FilterOperator)) {
    return;
  }
  List<FieldNode> parentPrunList = prunedColLists.get(curOp);
  if (parentPrunList == null || parentPrunList.size() == 0) {
    return;
  }
  // Walk a snapshot of the children: the loop body detaches the union from curOp and
  // attaches a new select in its place, which presumably edits curOp's child list.
  // Iterating the live list while doing so risks a ConcurrentModificationException or
  // skipped siblings when the filter has more than one child.
  List<Operator<? extends OperatorDesc>> children = new ArrayList<Operator<? extends OperatorDesc>>(curOp.getChildOperators());
  for (Operator<? extends OperatorDesc> child : children) {
    if (!(child instanceof UnionOperator)) {
      continue;
    }
    List<FieldNode> prunList = genColLists(child);
    if (prunList == null || prunList.size() == 0 || parentPrunList.size() == prunList.size()) {
      continue;
    }
    ArrayList<ExprNodeDesc> exprs = new ArrayList<ExprNodeDesc>();
    ArrayList<String> outputColNames = new ArrayList<String>();
    Map<String, ExprNodeDesc> colExprMap = new HashMap<String, ExprNodeDesc>();
    ArrayList<ColumnInfo> outputRS = new ArrayList<ColumnInfo>();
    for (ColumnInfo colInfo : child.getSchema().getSignature()) {
      // Keep only the columns that survive pruning on the union side.
      if (lookupColumn(prunList, colInfo.getInternalName()) == null) {
        continue;
      }
      ExprNodeDesc colDesc = new ExprNodeColumnDesc(colInfo.getType(), colInfo.getInternalName(), colInfo.getTabAlias(), colInfo.getIsVirtualCol());
      exprs.add(colDesc);
      outputColNames.add(colInfo.getInternalName());
      ColumnInfo newCol = new ColumnInfo(colInfo.getInternalName(), colInfo.getType(), colInfo.getTabAlias(), colInfo.getIsVirtualCol(), colInfo.isHiddenVirtualCol());
      newCol.setAlias(colInfo.getAlias());
      outputRS.add(newCol);
      colExprMap.put(colInfo.getInternalName(), colDesc);
    }
    SelectDesc select = new SelectDesc(exprs, outputColNames, false);
    // Re-link: curOp -> select -> union (previously curOp -> union).
    curOp.removeChild(child);
    SelectOperator sel = (SelectOperator) OperatorFactory.getAndMakeChild(select, new RowSchema(outputRS), curOp);
    OperatorFactory.makeChild(sel, child);
    sel.setColumnExprMap(colExprMap);
  }
}
Use of org.apache.hadoop.hive.ql.exec.UnionOperator in the Apache Hive project.
Class ConstantPropagateProcCtx, method getPropagatedConstants.
/**
 * Get propagated constant map from parents.
 *
 * Traverse all parents of current operator, if there is propagated constant (determined by
 * assignment expression like column=constant value), map it onto the matching column of the
 * current operator and add it to the returned constant map. Matching is positional for
 * Limit, Filter and Union operators and goes through the column expression map for Join
 * and all other single-parent operators.
 *
 * @param op
 *          operator getting the propagated constants.
 * @return map of ColumnInfo to ExprNodeDesc. The values of that map must be either
 *         ExprNodeConstantDesc or ExprNodeNullDesc. Empty (never null) when the operator
 *         has no schema, no parents, or no parent offers any constant.
 */
public Map<ColumnInfo, ExprNodeDesc> getPropagatedConstants(Operator<? extends Serializable> op) {
  // this map should map columnInfo to ExprConstantNodeDesc
  Map<ColumnInfo, ExprNodeDesc> constants = new HashMap<ColumnInfo, ExprNodeDesc>();
  if (op.getSchema() == null) {
    return constants;
  }
  RowSchema rs = op.getSchema();
  LOG.debug("Getting constants of op:" + op + " with rs:" + rs);
  if (op.getParentOperators() == null) {
    return constants;
  }
  // A previous solution is based on tableAlias and colAlias, which is
  // unsafe, esp. when CBO generates derived table names. see HIVE-13602.
  // For correctness purpose, we only trust colExpMap.
  // We assume that CBO can do the constantPropagation before this function is
  // called to help improve the performance.
  // UnionOperator, LimitOperator and FilterOperator are special, they should already be
  // column-position aligned.

  // One entry per parent, in parent order: column position in the parent's schema ->
  // constant. A parent without a known constant map contributes an empty map so that
  // index i of this list always corresponds to parent i (the join branch below looks
  // entries up by parent tag).
  List<Map<Integer, ExprNodeDesc>> parentsToConstant = new ArrayList<>();
  boolean areAllParentsContainConstant = true;
  boolean noParentsContainConstant = true;
  for (Operator<?> parent : op.getParentOperators()) {
    Map<ColumnInfo, ExprNodeDesc> constMap = opToConstantExprs.get(parent);
    Map<Integer, ExprNodeDesc> positionToConstant = new HashMap<>();
    if (constMap == null) {
      LOG.debug("Constant of Op " + parent.getOperatorId() + " is not found");
      areAllParentsContainConstant = false;
    } else {
      noParentsContainConstant = false;
      for (Entry<ColumnInfo, ExprNodeDesc> entry : constMap.entrySet()) {
        positionToConstant.put(parent.getSchema().getPosition(entry.getKey().getInternalName()), entry.getValue());
      }
      LOG.debug("Constant of Op " + parent.getOperatorId() + " " + constMap);
    }
    parentsToConstant.add(positionToConstant);
  }
  if (noParentsContainConstant) {
    return constants;
  }
  List<ColumnInfo> signature = op.getSchema().getSignature();
  if (op instanceof LimitOperator || op instanceof FilterOperator) {
    // there should be only one parent.
    if (op.getParentOperators().size() == 1) {
      Map<Integer, ExprNodeDesc> parentToConstant = parentsToConstant.get(0);
      for (int index = 0; index < signature.size(); index++) {
        if (parentToConstant.containsKey(index)) {
          constants.put(signature.get(index), parentToConstant.get(index));
        }
      }
    }
  } else if (op instanceof UnionOperator && areAllParentsContainConstant) {
    for (int index = 0; index < signature.size(); index++) {
      ExprNodeDesc constant = null;
      for (Map<Integer, ExprNodeDesc> parentToConstant : parentsToConstant) {
        if (!parentToConstant.containsKey(index)) {
          // if this parent does not contain a constant at this position, we
          // continue to look at other positions.
          constant = null;
          break;
        }
        if (constant == null) {
          constant = parentToConstant.get(index);
        } else {
          // compare if they are the same constant.
          ExprNodeDesc nextConstant = parentToConstant.get(index);
          if (!nextConstant.isSame(constant)) {
            // they are not the same constant. for example, union all of 1
            // and 2.
            constant = null;
            break;
          }
        }
      }
      // we have checked all the parents for the "index" position.
      if (constant != null) {
        constants.put(signature.get(index), constant);
      }
    }
  } else if (op instanceof JoinOperator) {
    JoinOperator joinOp = (JoinOperator) op;
    Map<String, ExprNodeDesc> colExprMap = op.getColumnExprMap();
    for (Entry<Byte, List<ExprNodeDesc>> e : joinOp.getConf().getExprs().entrySet()) {
      int tag = e.getKey();
      Operator<?> parent = op.getParentOperators().get(tag);
      List<ExprNodeDesc> exprs = e.getValue();
      if (exprs == null) {
        continue;
      }
      Map<Integer, ExprNodeDesc> parentToConstant = parentsToConstant.get(tag);
      for (ExprNodeDesc expr : exprs) {
        // we are only interested in ExprNodeColumnDesc
        if (!(expr instanceof ExprNodeColumnDesc)) {
          continue;
        }
        String parentColName = ((ExprNodeColumnDesc) expr).getColumn();
        // find this parentColName in its parent's rs
        int parentPos = parent.getSchema().getPosition(parentColName);
        if (!parentToConstant.containsKey(parentPos) || colExprMap == null) {
          continue;
        }
        // reverse look up colExprMap to find the childColName
        for (Entry<String, ExprNodeDesc> entry : colExprMap.entrySet()) {
          if (!entry.getValue().isSame(expr)) {
            continue;
          }
          int childPos = op.getSchema().getPosition(entry.getKey());
          if (childPos == -1) {
            // Not present in this operator's schema; same guard as the
            // single-parent branch below.
            continue;
          }
          // now propagate the constant from the parent to the child
          constants.put(signature.get(childPos), parentToConstant.get(parentPos));
        }
      }
    }
  } else {
    // there should be only one parent.
    if (op.getParentOperators().size() == 1) {
      Operator<?> parent = op.getParentOperators().get(0);
      Map<String, ExprNodeDesc> colExprMap = op.getColumnExprMap();
      if (colExprMap != null) {
        Map<Integer, ExprNodeDesc> parentToConstant = parentsToConstant.get(0);
        for (Entry<String, ExprNodeDesc> entry : colExprMap.entrySet()) {
          int childPos = op.getSchema().getPosition(entry.getKey());
          if (childPos == -1) {
            // Not present
            continue;
          }
          ExprNodeDesc expr = entry.getValue();
          if (expr instanceof ExprNodeColumnDesc) {
            String parentColName = ((ExprNodeColumnDesc) expr).getColumn();
            // find this parentColName in its parent's rs
            int parentPos = parent.getSchema().getPosition(parentColName);
            if (parentToConstant.containsKey(parentPos)) {
              // this position in parent is a constant
              // now propagate the constant from the parent to the child
              constants.put(signature.get(childPos), parentToConstant.get(parentPos));
            }
          }
        }
      }
    }
  }
  LOG.debug("Offering constants " + constants.keySet() + " to operator " + op.toString());
  return constants;
}
Use of org.apache.hadoop.hive.ql.exec.UnionOperator in the Apache Hive project.
Class GenMRUnion1, method process.
/**
 * Union Operator encountered. Currently, the algorithm is pretty simple: If
 * all the sub-queries are map-only, don't do anything. Otherwise, insert a
 * FileSink on top of all the sub-queries.
 *
 * This can be optimized later on.
 *
 * @param nd
 *          the union operator encountered
 * @param stack
 *          operators walked so far; used to locate which parent of the union is being
 *          visited
 * @param opProcCtx
 *          context; must be a GenMRProcContext
 * @param nodeOutputs
 *          not read by this method
 * @return {@code null} when all inputs of the union are in the same reducer, the result
 *         of processMapOnlyUnion when every sub-query is map-only, otherwise
 *         {@code true}
 */
@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx opProcCtx, Object... nodeOutputs) throws SemanticException {
  UnionOperator union = (UnionOperator) nd;
  GenMRProcContext ctx = (GenMRProcContext) opProcCtx;
  ParseContext parseCtx = ctx.getParseCtx();
  UnionProcContext uCtx = parseCtx.getUCtx();
  // Map-only subqueries can be optimized in future to not write to a file
  Map<Operator<? extends OperatorDesc>, GenMapRedCtx> mapCurrCtx = ctx.getMapCurrCtx();
  if (union.getConf().isAllInputsInSameReducer()) {
    // All inputs of this UnionOperator are in the same Reducer.
    // We do not need to break the operator tree.
    mapCurrCtx.put((Operator<? extends OperatorDesc>) nd, new GenMapRedCtx(ctx.getCurrTask(), ctx.getCurrAliasId()));
    return null;
  }
  UnionParseContext uPrsCtx = uCtx.getUnionParseContext(union);
  ctx.setCurrUnionOp(union);
  // Checked before the first dereference; asserting afterwards could never fire usefully.
  assert uPrsCtx != null;
  // map-reduce job
  if (uPrsCtx.allMapOnlySubQ()) {
    return processMapOnlyUnion(union, stack, ctx, uCtx);
  }
  Task<? extends Serializable> currTask = ctx.getCurrTask();
  int pos = UnionProcFactory.getPositionParent(union, stack);
  Task<? extends Serializable> uTask;
  GenMRUnionCtx uCtxTask = ctx.getUnionTask(union);
  if (uCtxTask == null) {
    // union is encountered for the first time: create its task and register it
    MapredWork uPlan = GenMapRedUtils.getMapRedWork(parseCtx);
    uTask = TaskFactory.get(uPlan, parseCtx.getConf());
    uCtxTask = new GenMRUnionCtx(uTask);
    ctx.setUnionTask(union, uCtxTask);
  } else {
    uTask = uCtxTask.getUTask();
  }
  if (uPrsCtx.getMapOnlySubq(pos) && uPrsCtx.getRootTask(pos)) {
    // The visited input is flagged as a map-only sub-query and as a root task:
    // copy it into the current union task plan and drop it from the root tasks.
    processSubQueryUnionMerge(ctx, uCtxTask, union, stack);
    ctx.getRootTasks().remove(currTask);
  } else {
    // If it is a map-reduce job, create a temporary file
    // is the current task a root task
    if (shouldBeRootTask(currTask) && !ctx.getRootTasks().contains(currTask) && (currTask.getParentTasks() == null || currTask.getParentTasks().isEmpty())) {
      ctx.getRootTasks().add(currTask);
    }
    processSubQueryUnionCreateIntermediate(union.getParentOperators().get(pos), union, uTask, ctx, uCtxTask);
    // the currAliasId and CurrTopOp is not valid any more
    ctx.setCurrAliasId(null);
    ctx.setCurrTopOp(null);
    ctx.getOpTaskMap().put(null, uTask);
  }
  ctx.setCurrTask(uTask);
  mapCurrCtx.put((Operator<? extends OperatorDesc>) nd, new GenMapRedCtx(ctx.getCurrTask(), null));
  return true;
}
Aggregations