use of org.apache.hadoop.hive.ql.exec.DummyStoreOperator in project hive by apache.
the class GenTezWork method process.
/**
 * Handles a leaf operator (per the comments below, a file sink or reduce sink) that
 * closes the operator pipeline starting at {@code context.currentRootOperator}, and
 * records that pipeline as a unit of work in the current task's {@code TezWork}.
 *
 * <p>In order, the method: looks up or creates the work for the current root; registers
 * the leaf in {@code childToWorkMap}; wraps the work into a {@code MergeJoinWork} when a
 * merge join is being walked; links pending map joins to the work; detaches the root from
 * its parents; folds the work into a {@code UnionWork} when unions were seen; connects the
 * work to any already-known following work; and finally advances the context to the next
 * root operator.
 *
 * @param nd the operator being visited; cast to {@code Operator<?>}
 * @param stack the walker stack; only handed to {@code getParentFromStack}
 * @param procContext the walker context; cast to {@code GenTezProcContext}
 * @param nodeOutputs not read by this method
 * @return always {@code null}
 * @throws SemanticException if {@code context.mapJoinParentMap} has no entry for a map
 *         join that is flagged as a dynamic partitioned hash join
 */
@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procContext, Object... nodeOutputs) throws SemanticException {
GenTezProcContext context = (GenTezProcContext) procContext;
assert context != null && context.currentTask != null && context.currentRootOperator != null;
// Operator is a file sink or reduce sink. Something that forces
// a new vertex.
Operator<?> operator = (Operator<?>) nd;
// root is the start of the operator pipeline we're currently
// packing into a vertex, typically a table scan, union or join
Operator<?> root = context.currentRootOperator;
LOG.debug("Root operator: " + root);
LOG.debug("Leaf operator: " + operator);
if (context.clonedReduceSinks.contains(operator)) {
// This operator is one of the reduce sinks cloned further down in this method
// (see context.clonedReduceSinks.add below); just skip it and keep going.
return null;
}
TezWork tezWork = context.currentTask.getWork();
// Right now the work graph is pretty simple. If there is no
// preceding work we have a root and will generate a map
// vertex. If there is a preceding work we will generate
// a reduce vertex.
BaseWork work;
if (context.rootToWorkMap.containsKey(root)) {
// The root has been seen before, so work already exists for it. Reusing that
// work for another leaf will result in a vertex with multiple FS or RS operators.
if (context.childToWorkMap.containsKey(operator)) {
// if we've seen both root and child, we can bail.
// clear out the mapjoin set. we don't need it anymore.
context.currentMapJoinOperators.clear();
// clear out the union set. we don't need it anymore.
context.currentUnionOperators.clear();
return null;
} else {
// At this point we don't have to do anything special. Just
// run through the regular paces w/o creating a new task.
work = context.rootToWorkMap.get(root);
}
} else {
// create a new vertex: map work when nothing precedes it, reduce work otherwise
if (context.preceedingWork == null) {
work = utils.createMapWork(context, root, tezWork, null);
} else {
work = GenTezUtils.createReduceWork(context, root, tezWork);
}
context.rootToWorkMap.put(root, work);
}
// this is where we set the sort columns that we will be using for KeyValueInputMerge
if (operator instanceof DummyStoreOperator) {
work.addSortCols(root.getOpTraits().getSortCols().get(0));
}
// Remember which work(s) this leaf operator belongs to.
if (!context.childToWorkMap.containsKey(operator)) {
List<BaseWork> workItems = new LinkedList<BaseWork>();
workItems.add(work);
context.childToWorkMap.put(operator, workItems);
} else {
context.childToWorkMap.get(operator).add(work);
}
// The merge join handling below may replace the work item itself (see
// "work = mergeJoinWork"), which can affect the working of all downstream
// transformations in this method.
if (context.currentMergeJoinOperator != null) {
// we are currently walking the big table side of the merge join. we need to create or hook up
// merge join work.
MergeJoinWork mergeJoinWork = null;
if (context.opMergeJoinWorkMap.containsKey(context.currentMergeJoinOperator)) {
// we have found a merge work corresponding to this closing operator. Hook up this work.
mergeJoinWork = context.opMergeJoinWorkMap.get(context.currentMergeJoinOperator);
} else {
// we need to create the merge join work
mergeJoinWork = new MergeJoinWork();
mergeJoinWork.setMergeJoinOperator(context.currentMergeJoinOperator);
tezWork.add(mergeJoinWork);
context.opMergeJoinWorkMap.put(context.currentMergeJoinOperator, mergeJoinWork);
}
// connect the work correctly.
work.addSortCols(root.getOpTraits().getSortCols().get(0));
mergeJoinWork.addMergedWork(work, null, context.leafOperatorToFollowingWork);
// NOTE(review): parentOp is only referenced by the commented-out line below;
// it is otherwise unused in this method.
Operator<? extends OperatorDesc> parentOp = getParentFromStack(context.currentMergeJoinOperator, stack);
// Set the big table position. Both the reduce work and merge join operator
// should be set with the same value.
// int pos = context.currentMergeJoinOperator.getTagForOperator(parentOp);
int pos = context.currentMergeJoinOperator.getConf().getBigTablePosition();
work.setTag(pos);
// NOTE(review): this writes back the value that was just read from the same conf.
context.currentMergeJoinOperator.getConf().setBigTablePosition(pos);
tezWork.setVertexType(work, VertexType.MULTI_INPUT_UNINITIALIZED_EDGES);
// Move every incoming edge of work onto the merge join work.
for (BaseWork parentWork : tezWork.getParents(work)) {
TezEdgeProperty edgeProp = tezWork.getEdgeProperty(parentWork, work);
tezWork.disconnect(parentWork, work);
tezWork.connect(parentWork, mergeJoinWork, edgeProp);
}
// Move every outgoing edge of work onto the merge join work.
for (BaseWork childWork : tezWork.getChildren(work)) {
TezEdgeProperty edgeProp = tezWork.getEdgeProperty(work, childWork);
tezWork.disconnect(work, childWork);
tezWork.connect(mergeJoinWork, childWork, edgeProp);
}
// The merge join work now stands in for work in the graph and in the context maps.
tezWork.remove(work);
context.rootToWorkMap.put(root, mergeJoinWork);
context.childToWorkMap.get(operator).remove(work);
context.childToWorkMap.get(operator).add(mergeJoinWork);
work = mergeJoinWork;
context.currentMergeJoinOperator = null;
}
// remember which mapjoin operator links with which work
if (!context.currentMapJoinOperators.isEmpty()) {
for (MapJoinOperator mj : context.currentMapJoinOperators) {
// For a dynamic partitioned hash join, keep track of the small-table parent to
// mapjoin mapping (smallTableParentToMapJoinMap) so we can later run the same
// logic that is run in ReduceSinkMapJoinProc (see the second-pass section below).
if (mj.getConf().isDynamicPartitionHashJoin()) {
// Since this is a dynamic partitioned hash join, the work for this join should be a ReduceWork
ReduceWork reduceWork = (ReduceWork) work;
int bigTablePosition = mj.getConf().getPosBigTable();
reduceWork.setTag(bigTablePosition);
// Use context.mapJoinParentMap to get the original RS parents, because
// the MapJoin's parents may have been replaced by dummy operator.
List<Operator<?>> mapJoinOriginalParents = context.mapJoinParentMap.get(mj);
if (mapJoinOriginalParents == null) {
throw new SemanticException("Unexpected error - context.mapJoinParentMap did not have an entry for " + mj);
}
for (int pos = 0; pos < mapJoinOriginalParents.size(); ++pos) {
// This processing only needs to happen for the small tables
if (pos == bigTablePosition) {
continue;
}
Operator<?> parentOp = mapJoinOriginalParents.get(pos);
context.smallTableParentToMapJoinMap.put(parentOp, mj);
ReduceSinkOperator parentRS = (ReduceSinkOperator) parentOp;
// TableDesc needed for dynamic partitioned hash join
GenMapRedUtils.setKeyAndValueDesc(reduceWork, parentRS);
// For small table RS parents that are no longer listed as unprocessed, record
// in the reduce work which parent work feeds which tag.
if (!context.mapJoinToUnprocessedSmallTableReduceSinks.get(mj).contains(parentRS)) {
// This reduce sink has been processed already, so the work for the parentRS exists
BaseWork parentWork = ReduceSinkMapJoinProc.getMapJoinParentWork(context, parentRS);
int tag = parentRS.getConf().getTag();
// a tag of -1 is recorded as tag 0
tag = (tag == -1 ? 0 : tag);
reduceWork.getTagToInput().put(tag, parentWork.getName());
}
}
}
LOG.debug("Processing map join: " + mj);
// remember the mapjoin-to-work mapping in case we scan another branch of the
// mapjoin later
if (!context.mapJoinWorkMap.containsKey(mj)) {
List<BaseWork> workItems = new LinkedList<BaseWork>();
workItems.add(work);
context.mapJoinWorkMap.put(mj, workItems);
} else {
context.mapJoinWorkMap.get(mj).add(work);
}
/*
 * this happens in case of map join operations.
 * The tree looks like this:
 *
 *        RS <--- we are here perhaps
 *        |
 *     MapJoin
 *     /     \
 *   RS       TS
 *  /
 * TS
 *
 * If we are at the RS pointed above, and we may have already visited the
 * RS following the TS, we have already generated work for the TS-RS.
 * We need to hook the current work to this generated work.
 */
if (context.linkOpWithWorkMap.containsKey(mj)) {
Map<BaseWork, TezEdgeProperty> linkWorkMap = context.linkOpWithWorkMap.get(mj);
if (linkWorkMap != null) {
// Note: it's not quite clear why this is done inside this if. Seems like it should be on the top level.
if (context.linkChildOpWithDummyOp.containsKey(mj)) {
if (LOG.isDebugEnabled()) {
LOG.debug("Adding dummy ops to work: " + work.getName() + ": " + context.linkChildOpWithDummyOp.get(mj));
}
for (Operator<?> dummy : context.linkChildOpWithDummyOp.get(mj)) {
work.addDummyOp((HashTableDummyOperator) dummy);
}
}
// Connect each previously generated parent work to the current work using the
// edge property that was recorded for it.
for (Entry<BaseWork, TezEdgeProperty> parentWorkMap : linkWorkMap.entrySet()) {
BaseWork parentWork = parentWorkMap.getKey();
LOG.debug("connecting " + parentWork.getName() + " with " + work.getName());
TezEdgeProperty edgeProp = parentWorkMap.getValue();
tezWork.connect(parentWork, work, edgeProp);
if (edgeProp.getEdgeType() == EdgeType.CUSTOM_EDGE) {
tezWork.setVertexType(work, VertexType.INITIALIZED_EDGES);
}
// need to set up the output name for each reduce sink now that we know the name
// of the downstream work
for (ReduceSinkOperator r : context.linkWorkWithReduceSinkMap.get(parentWork)) {
if (r.getConf().getOutputName() != null) {
LOG.debug("Cloning reduce sink for multi-child broadcast edge");
// we've already set this one up. Need to clone for the next work.
r = (ReduceSinkOperator) OperatorFactory.getAndMakeChild(r.getCompilationOpContext(), (ReduceSinkDesc) r.getConf().clone(), new RowSchema(r.getSchema()), r.getParentOperators());
context.clonedReduceSinks.add(r);
}
r.getConf().setOutputName(work.getName());
context.connectedReduceSinks.add(r);
}
}
}
}
}
// clear out the set. we don't need it anymore.
context.currentMapJoinOperators.clear();
}
// Cut the tree here: detach root from each of its parents and remember, per parent,
// that this work follows it, because we might have to connect parent work with this
// work later. The loop iterates over a copy because removeParent is called on root
// inside the loop.
for (Operator<?> parent : new ArrayList<Operator<?>>(root.getParentOperators())) {
if (LOG.isDebugEnabled()) {
LOG.debug("Removing " + parent + " as parent from " + root);
}
context.leafOperatorToFollowingWork.remove(parent);
context.leafOperatorToFollowingWork.put(parent, work);
root.removeParent(parent);
}
if (!context.currentUnionOperators.isEmpty()) {
// if there are union all operators, it means that the walking context contains union all operators.
// please see more details of context.currentUnionOperator in GenTezWorkWalker
UnionWork unionWork;
if (context.unionWorkMap.containsKey(operator)) {
// we've seen this terminal before and a union work object exists for it; we just
// need to add this work to it. The assert below expects this operator to have no
// children, since we've passed this operator before.
assert operator.getChildOperators().isEmpty();
unionWork = (UnionWork) context.unionWorkMap.get(operator);
// finally connect the union work with work
connectUnionWorkWithWork(unionWork, work, tezWork, context);
} else {
// we've not seen this terminal before. we need to check
// rootUnionWorkMap which contains the information of mapping the root
// operator of a union work to a union work
unionWork = context.rootUnionWorkMap.get(root);
if (unionWork == null) {
// if unionWork is null, it means it is the first time. we need to
// create a union work object and add this work to it. Subsequent
// work should reference the union and not the actual work.
unionWork = GenTezUtils.createUnionWork(context, root, operator, tezWork);
// finally connect the union work with work
connectUnionWorkWithWork(unionWork, work, tezWork, context);
}
}
context.currentUnionOperators.clear();
// from here on the union work stands in for the actual work
work = unionWork;
}
// If this leaf was already registered in leafOperatorToFollowingWork (filled by the
// parent-detaching loop above), work that comes after the current work already exists
// and has to be connected now. Terminology note: in this code roots are data sources
// and leaves are data sinks.
if (context.leafOperatorToFollowingWork.containsKey(operator)) {
BaseWork followingWork = context.leafOperatorToFollowingWork.get(operator);
long bytesPerReducer = context.conf.getLongVar(HiveConf.ConfVars.BYTESPERREDUCER);
LOG.debug("Second pass. Leaf operator: " + operator + " has common downstream work:" + followingWork);
if (operator instanceof DummyStoreOperator) {
// this is the small table side.
assert (followingWork instanceof MergeJoinWork);
MergeJoinWork mergeJoinWork = (MergeJoinWork) followingWork;
CommonMergeJoinOperator mergeJoinOp = mergeJoinWork.getMergeJoinOperator();
work.setTag(mergeJoinOp.getTagForOperator(operator));
mergeJoinWork.addMergedWork(null, work, context.leafOperatorToFollowingWork);
tezWork.setVertexType(mergeJoinWork, VertexType.MULTI_INPUT_UNINITIALIZED_EDGES);
// Move every incoming edge of work onto the merge join work.
for (BaseWork parentWork : tezWork.getParents(work)) {
TezEdgeProperty edgeProp = tezWork.getEdgeProperty(parentWork, work);
tezWork.disconnect(parentWork, work);
tezWork.connect(parentWork, mergeJoinWork, edgeProp);
}
work = mergeJoinWork;
} else {
// need to add this branch to the key + value info
assert operator instanceof ReduceSinkOperator && ((followingWork instanceof ReduceWork) || (followingWork instanceof MergeJoinWork) || followingWork instanceof UnionWork);
ReduceSinkOperator rs = (ReduceSinkOperator) operator;
// Resolve the ReduceWork that actually receives this reduce sink's output.
ReduceWork rWork = null;
if (followingWork instanceof MergeJoinWork) {
MergeJoinWork mergeJoinWork = (MergeJoinWork) followingWork;
rWork = (ReduceWork) mergeJoinWork.getMainWork();
} else if (followingWork instanceof UnionWork) {
// this can only be possible if there is merge work followed by the union
UnionWork unionWork = (UnionWork) followingWork;
int index = getFollowingWorkIndex(tezWork, unionWork, rs);
BaseWork baseWork = tezWork.getChildren(unionWork).get(index);
if (baseWork instanceof MergeJoinWork) {
MergeJoinWork mergeJoinWork = (MergeJoinWork) baseWork;
// disconnect the connection to union work and connect to merge work
followingWork = mergeJoinWork;
rWork = (ReduceWork) mergeJoinWork.getMainWork();
} else {
rWork = (ReduceWork) baseWork;
}
} else {
rWork = (ReduceWork) followingWork;
}
GenMapRedUtils.setKeyAndValueDesc(rWork, rs);
// remember which parent belongs to which tag (a tag of -1 is recorded as tag 0)
int tag = rs.getConf().getTag();
rWork.getTagToInput().put(tag == -1 ? 0 : tag, work.getName());
// remember the output name of the reduce sink
rs.getConf().setOutputName(rWork.getName());
// For dynamic partitioned hash join, run the ReduceSinkMapJoinProc logic for any
// ReduceSink parents that we missed.
MapJoinOperator mj = context.smallTableParentToMapJoinMap.get(rs);
if (mj != null) {
// Only need to run the logic for tables we missed
if (context.mapJoinToUnprocessedSmallTableReduceSinks.get(mj).contains(rs)) {
// ReduceSinkMapJoinProc logic does not work unless the ReduceSink is connected as
// a parent of the MapJoin, but at this point we have already removed all of the
// parents from the MapJoin.
// Try temporarily adding the RS as a parent
ArrayList<Operator<?>> tempMJParents = new ArrayList<Operator<?>>();
tempMJParents.add(rs);
mj.setParentOperators(tempMJParents);
// ReduceSink also needs MapJoin as child
List<Operator<?>> rsChildren = rs.getChildOperators();
rsChildren.add(mj);
// Since the MapJoin has had all of its other parents removed at this point,
// it would be bad here if processReduceSinkToHashJoin() tries to do anything
// with the RS parent based on its position in the list of parents.
ReduceSinkMapJoinProc.processReduceSinkToHashJoin(rs, mj, context);
// Remove any parents from MapJoin again
mj.removeParents();
// TODO: do we also need to remove the MapJoin from the list of RS's children?
}
}
if (!context.connectedReduceSinks.contains(rs)) {
// add dependency between the two work items
TezEdgeProperty edgeProp;
EdgeType edgeType = GenTezUtils.determineEdgeType(work, followingWork, rs);
if (rWork.isAutoReduceParallelism()) {
edgeProp = new TezEdgeProperty(context.conf, edgeType, true, rWork.getMinReduceTasks(), rWork.getMaxReduceTasks(), bytesPerReducer);
} else {
edgeProp = new TezEdgeProperty(edgeType);
}
tezWork.connect(work, followingWork, edgeProp);
context.connectedReduceSinks.add(rs);
}
}
} else {
LOG.debug("First pass. Leaf operator: " + operator);
}
// No children means we're at the bottom. If there are more operators to scan
// the next item will be a new root.
if (!operator.getChildOperators().isEmpty()) {
assert operator.getChildOperators().size() == 1;
context.parentOfRoot = operator;
context.currentRootOperator = operator.getChildOperators().get(0);
context.preceedingWork = work;
}
return null;
}
use of org.apache.hadoop.hive.ql.exec.DummyStoreOperator in project hive by apache.
the class ReduceRecordProcessor method init.
/**
 * Initializes the reduce-side operator pipeline of this processor.
 *
 * <p>The method waits for the shuffle inputs, registers the main reduce work and any
 * merged works by tag, publishes inputs and record sources on the {@code TezContext},
 * initializes the reducer(s) and the dummy operators, and finally wires the output
 * collectors. {@code checkAbortCondition()} is called between these steps.
 *
 * @param mrReporter passed through to {@code super.init}
 * @param inputs logical inputs by name; used to find the shuffle inputs and handed to
 *        the {@code TezContext} and the dynamic value registry configuration
 * @param outputs logical outputs by name; passed through to {@code super.init}
 * @throws Exception {@code OutOfMemoryError} and {@code InterruptedException} raised in
 *         the final initialization section are rethrown as is; any other throwable from
 *         that section is wrapped in a {@code RuntimeException}
 */
@Override
void init(MRTaskReporter mrReporter, Map<String, LogicalInput> inputs, Map<String, LogicalOutput> outputs) throws Exception {
perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.TEZ_INIT_OPERATORS);
super.init(mrReporter, inputs, outputs);
MapredContext.init(false, new JobConf(jconf));
List<LogicalInput> shuffleInputs = getShuffleInputs(inputs);
// TODO HIVE-14042. Move to using a loop and a timed wait once TEZ-3302 is fixed.
checkAbortCondition();
if (shuffleInputs != null) {
l4j.info("Waiting for ShuffleInputs to become ready");
processorContext.waitForAllInputsReady(new ArrayList<Input>(shuffleInputs));
}
connectOps.clear();
ReduceWork redWork = reduceWork;
l4j.info("Main work is " + reduceWork.getName());
// Collect the dummy operators of the main work; those of merged works are added below.
List<HashTableDummyOperator> workOps = reduceWork.getDummyOps();
HashSet<HashTableDummyOperator> dummyOps = workOps == null ? null : new HashSet<>(workOps);
tagToReducerMap.put(redWork.getTag(), redWork);
if (mergeWorkList != null) {
for (BaseWork mergeWork : mergeWorkList) {
if (l4j.isDebugEnabled()) {
l4j.debug("Additional work " + mergeWork.getName());
}
workOps = mergeWork.getDummyOps();
if (workOps != null) {
if (dummyOps == null) {
dummyOps = new HashSet<>(workOps);
} else {
dummyOps.addAll(workOps);
}
}
ReduceWork mergeReduceWork = (ReduceWork) mergeWork;
reducer = mergeReduceWork.getReducer();
// Check for an abort immediately after reducer is assigned, in case the abort
// came in while the reducer was being set up.
checkAbortCondition();
// Register the merged work's dummy store and the work itself under its tag.
DummyStoreOperator dummyStoreOp = getJoinParentOp(reducer);
connectOps.put(mergeReduceWork.getTag(), dummyStoreOp);
tagToReducerMap.put(mergeReduceWork.getTag(), mergeReduceWork);
}
((TezContext) MapredContext.get()).setDummyOpsMap(connectOps);
}
checkAbortCondition();
// The tag of the main reduce work is used as the big table position.
bigTablePosition = (byte) reduceWork.getTag();
ObjectInspector[] mainWorkOIs = null;
((TezContext) MapredContext.get()).setInputs(inputs);
((TezContext) MapredContext.get()).setTezProcessorContext(processorContext);
int numTags = reduceWork.getTagToValueDesc().size();
reducer = reduceWork.getReducer();
// Check for an abort immediately after reducer is assigned, in case the abort
// came in while the reducer was being set up.
checkAbortCondition();
// set memory available for operators
long memoryAvailableToTask = processorContext.getTotalMemoryAvailableToTask();
if (reducer.getConf() != null) {
reducer.getConf().setMaxMemoryAvailable(memoryAvailableToTask);
l4j.info("Memory available for operators set to {}", LlapUtil.humanReadableByteCount(memoryAvailableToTask));
}
OperatorUtils.setMemoryAvailable(reducer.getChildOperators(), memoryAvailableToTask);
// Setup values registry: fetch it from the cache, creating it on a cache miss.
String valueRegistryKey = DynamicValue.DYNAMIC_VALUE_REGISTRY_CACHE_KEY;
DynamicValueRegistryTez registryTez = dynamicValueCache.retrieve(valueRegistryKey, new Callable<DynamicValueRegistryTez>() {
@Override
public DynamicValueRegistryTez call() {
return new DynamicValueRegistryTez();
}
});
dynamicValueCacheKeys.add(valueRegistryKey);
RegistryConfTez registryConf = new RegistryConfTez(jconf, reduceWork, processorContext, inputs);
registryTez.init(registryConf);
checkAbortCondition();
if (numTags > 1) {
// The main work itself carries several tags: one record source per tag, all
// feeding the main reducer.
sources = new ReduceRecordSource[numTags];
mainWorkOIs = new ObjectInspector[numTags];
initializeMultipleSources(reduceWork, numTags, mainWorkOIs, sources);
((TezContext) MapredContext.get()).setRecordSources(sources);
reducer.initialize(jconf, mainWorkOIs);
} else {
// One record source per registered work (main work plus merged works), indexed by tag.
numTags = tagToReducerMap.keySet().size();
sources = new ReduceRecordSource[numTags];
mainWorkOIs = new ObjectInspector[numTags];
for (int i : tagToReducerMap.keySet()) {
redWork = tagToReducerMap.get(i);
reducer = redWork.getReducer();
// Check for an abort immediately after reducer is assigned, in case the abort
// came in while the reducer was being set up.
checkAbortCondition();
initializeSourceForTag(redWork, i, mainWorkOIs, sources, redWork.getTagToValueDesc().get(0), redWork.getTagToInput().get(0));
reducer.initializeLocalWork(jconf);
}
reducer = reduceWork.getReducer();
// Check for an abort immediately after reducer is assigned, in case the abort
// came in while the reducer was being set up.
checkAbortCondition();
((TezContext) MapredContext.get()).setRecordSources(sources);
// Initialize the main (big table) reducer first, then the reducers of the other tags.
reducer.initialize(jconf, new ObjectInspector[] { mainWorkOIs[bigTablePosition] });
for (int i : tagToReducerMap.keySet()) {
if (i == bigTablePosition) {
continue;
}
redWork = tagToReducerMap.get(i);
reducer = redWork.getReducer();
// Check for an abort immediately after reducer is assigned, in case the abort
// came in while the reducer was being set up.
checkAbortCondition();
reducer.initialize(jconf, new ObjectInspector[] { mainWorkOIs[i] });
}
}
checkAbortCondition();
// From here on reducer refers to the main work's reducer again.
reducer = reduceWork.getReducer();
// initialize reduce operator tree
try {
l4j.info(reducer.dump(0));
// Initialize the dummy parent operators as well.
if (dummyOps != null) {
for (HashTableDummyOperator dummyOp : dummyOps) {
// TODO HIVE-14042. Propagating abort to dummyOps.
dummyOp.initialize(jconf, null);
checkAbortCondition();
}
}
// set output collector for any reduce sink operators in the pipeline.
List<Operator<?>> children = new LinkedList<Operator<?>>();
children.add(reducer);
if (dummyOps != null) {
children.addAll(dummyOps);
}
createOutputMap();
OperatorUtils.setChildrenCollector(children, outMap);
checkAbortCondition();
reducer.setReporter(reporter);
MapredContext.get().setReporter(reporter);
} catch (Throwable e) {
// Mark the processor as aborted before propagating the failure.
super.setAborted(true);
if (e instanceof OutOfMemoryError) {
// Don't create a new object if we are already out of memory
throw (OutOfMemoryError) e;
} else if (e instanceof InterruptedException) {
l4j.info("Hit an interrupt while initializing ReduceRecordProcessor. Message={}", e.getMessage());
throw (InterruptedException) e;
} else {
throw new RuntimeException("Reduce operator initialization failed", e);
}
}
perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.TEZ_INIT_OPERATORS);
}
use of org.apache.hadoop.hive.ql.exec.DummyStoreOperator in project hive by apache.
the class AbstractSMBJoinProc method convertBucketMapJoinToSMBJoin.
/**
 * Converts the bucket map-join operator to a sort-merge map join operator.
 *
 * <p>A new {@code SMBMapJoinOperator} is built from {@code mapJoinOp} and takes its place
 * in the operator graph. Every parent other than the big table gets a
 * {@code DummyStoreOperator} inserted between itself and the new join. Consider a query
 * like:
 *
 * <pre>
 *   select * from
 *     (subq1 --> has a filter)
 *     join
 *     (subq2 --> has a filter)
 *   on some key
 * </pre>
 *
 * Assuming subq1 is the small table (either specified by the user or inferred
 * automatically), the following operator tree will be created:
 *
 * <pre>
 *   TableScan (subq1) --> Select --> Filter --> DummyStore
 *                                                          \
 *                                                           \   SMBJoin
 *                                                           /
 *                                                          /
 *   TableScan (subq2) --> Select --> Filter
 * </pre>
 *
 * @param mapJoinOp the map-join operator to replace
 * @param smbJoinContext supplies the source aliases via {@code getSrcs()}
 * @return the newly created sort-merge map join operator
 */
protected SMBMapJoinOperator convertBucketMapJoinToSMBJoin(MapJoinOperator mapJoinOp, SortBucketJoinProcCtx smbJoinContext) {
  String[] aliases = smbJoinContext.getSrcs();

  SMBMapJoinOperator smbJop = new SMBMapJoinOperator(mapJoinOp);
  SMBJoinDesc smbJoinDesc = new SMBJoinDesc(mapJoinOp.getConf());
  smbJop.setConf(smbJoinDesc);

  // The position of an alias in the source array doubles as its tag.
  HashMap<Byte, String> aliasByTag = new HashMap<Byte, String>();
  for (int tag = 0; tag < aliases.length; tag++) {
    aliasByTag.put((byte) tag, aliases[tag]);
  }
  smbJoinDesc.setTagToAlias(aliasByTag);

  // Keep the "map joins without reducer" list in sync, preserving the position.
  int noReducerPos = this.pGraphContext.getListMapJoinOpsNoReducer().indexOf(mapJoinOp);
  if (noReducerPos >= 0) {
    this.pGraphContext.getListMapJoinOpsNoReducer().set(noReducerPos, smbJop);
  }

  // For all parents (other than the big table), insert a dummy store operator.
  Map<String, DummyStoreOperator> aliasToSink = new HashMap<String, DummyStoreOperator>();
  List<Operator<? extends OperatorDesc>> joinParents = mapJoinOp.getParentOperators();
  int bigTablePos = smbJoinDesc.getPosBigTable();
  for (int pos = 0; pos < joinParents.size(); pos++) {
    Operator<? extends OperatorDesc> parent = joinParents.get(pos);
    List<Operator<? extends OperatorDesc>> siblings = parent.getChildOperators();
    int slot = siblings.indexOf(mapJoinOp);

    if (pos == bigTablePos) {
      // The big table parent feeds the new join directly.
      siblings.set(slot, smbJop);
      continue;
    }

    // Small table: parent --> DummyStore --> SMBJoin
    DummyStoreOperator dummyStore = new DummyStoreOperator(parent.getCompilationOpContext());
    siblings.set(slot, dummyStore);

    List<Operator<? extends OperatorDesc>> dummyChildren = new ArrayList<Operator<? extends OperatorDesc>>();
    dummyChildren.add(smbJop);
    dummyStore.setChildOperators(dummyChildren);

    List<Operator<? extends OperatorDesc>> dummyParents = new ArrayList<Operator<? extends OperatorDesc>>();
    dummyParents.add(parent);
    dummyStore.setParentOperators(dummyParents);

    aliasToSink.put(aliases[pos], dummyStore);
    smbJop.getParentOperators().set(pos, dummyStore);
  }
  smbJoinDesc.setAliasToSink(aliasToSink);

  // Point every child of the old join at the new join, keeping its parent position.
  for (Operator<? extends OperatorDesc> child : mapJoinOp.getChildOperators()) {
    List<Operator<? extends OperatorDesc>> childParents = child.getParentOperators();
    childParents.set(childParents.indexOf(mapJoinOp), smbJop);
  }

  // Data structures coming from QBJoinTree
  smbJop.getConf().setQBJoinTreeProps(mapJoinOp.getConf());

  // Swap the operators in the parse context bookkeeping.
  pGraphContext.getSmbMapJoinOps().add(smbJop);
  pGraphContext.getMapJoinOps().remove(mapJoinOp);
  return smbJop;
}
use of org.apache.hadoop.hive.ql.exec.DummyStoreOperator in project hive by apache.
the class SortMergeJoinTaskDispatcher method genSMBJoinWork.
/**
 * Converts the work in the SMB plan to a regular join.
 *
 * <p>Note that the operator tree is not fixed, only the path/alias mappings in the
 * plan are fixed. The operator tree will still contain the SMBJoinOperator.
 *
 * <p>Steps: (1) drop every path, and the aliases registered for it, for which none of
 * the aliases has partition info; (2) copy each alias of the join's local work into
 * {@code currWork} together with its table or partition directories; (3) unlink the
 * dummy store operators sitting between the join and its parents.
 *
 * @param currWork the map work whose path/alias mappings are rewritten in place
 * @param smbJoinOp the sort-merge join operator whose local work and parents are read
 */
private void genSMBJoinWork(MapWork currWork, SMBMapJoinOperator smbJoinOp) {
  // Remove the paths which are not part of aliasToPartitionInfo
  Map<String, PartitionDesc> aliasToPartitionInfo = currWork.getAliasToPartnInfo();
  List<Path> removePaths = new ArrayList<>();
  for (Map.Entry<Path, ArrayList<String>> entry : currWork.getPathToAliases().entrySet()) {
    boolean keepPath = false;
    for (String alias : entry.getValue()) {
      if (aliasToPartitionInfo.containsKey(alias)) {
        keepPath = true;
        break;
      }
    }
    // Remove if the path is not present
    if (!keepPath) {
      removePaths.add(entry.getKey());
    }
  }

  // Paths are collected first and removed afterwards so the map is not modified
  // while its entry set is being iterated.
  List<String> removeAliases = new ArrayList<String>();
  for (Path removePath : removePaths) {
    removeAliases.addAll(currWork.getPathToAliases().get(removePath));
    currWork.removePathToAlias(removePath);
    currWork.removePathToPartitionInfo(removePath);
  }
  for (String alias : removeAliases) {
    currWork.getAliasToPartnInfo().remove(alias);
    currWork.getAliasToWork().remove(alias);
  }

  // Get the MapredLocalWork
  MapredLocalWork localWork = smbJoinOp.getConf().getLocalWork();
  for (Map.Entry<String, Operator<? extends OperatorDesc>> entry : localWork.getAliasToWork().entrySet()) {
    String alias = entry.getKey();
    Operator<? extends OperatorDesc> op = entry.getValue();
    FetchWork fetchWork = localWork.getAliasToFetchWork().get(alias);
    // Add the entry in mapredwork
    currWork.getAliasToWork().put(alias, op);
    PartitionDesc partitionInfo = currWork.getAliasToPartnInfo().get(alias);
    // Use the table directory when there is one, otherwise every partition directory.
    if (fetchWork.getTblDir() != null) {
      currWork.mergeAliasedInput(alias, fetchWork.getTblDir(), partitionInfo);
    } else {
      for (Path pathDir : fetchWork.getPartDir()) {
        currWork.mergeAliasedInput(alias, pathDir, partitionInfo);
      }
    }
  }

  // Remove the dummy store operator from the tree. Iterate over a snapshot of the
  // parent list: replaceParent() changes the join's parent list inside the loop, and
  // mutating a list while iterating it directly risks a ConcurrentModificationException.
  List<Operator<? extends OperatorDesc>> parentsSnapshot =
      new ArrayList<Operator<? extends OperatorDesc>>(smbJoinOp.getParentOperators());
  for (Operator<? extends OperatorDesc> parentOp : parentsSnapshot) {
    if (parentOp instanceof DummyStoreOperator) {
      Operator<? extends OperatorDesc> grandParentOp = parentOp.getParentOperators().get(0);
      smbJoinOp.replaceParent(parentOp, grandParentOp);
      grandParentOp.setChildOperators(parentOp.getChildOperators());
      // Detach the dummy store from its former parent.
      // NOTE(review): only the parent link is cleared; the dummy store's child list
      // (the one just handed to the grandparent) is left in place. Confirm whether
      // it should be cleared as well.
      parentOp.setParentOperators(null);
    }
  }
}
use of org.apache.hadoop.hive.ql.exec.DummyStoreOperator in project hive by apache.
the class ConvertJoinMapJoin method convertJoinSMBJoin.
/**
 * Replaces the join operator with a new {@code CommonMergeJoinOperator} and, when
 * {@code adjustParentsChildren} is set, removes the parent reduce sinks.
 *
 * <p>With {@code adjustParentsChildren} the join's direct parents are bypassed (their own
 * parents become the parents of the merge join) and a {@code TezDummyStoreOperator} is
 * inserted in front of every parent except the one at {@code mapJoinConversionPos}.
 *
 * @param joinOp the join operator to replace
 * @param context supplies the configuration used to build the map join descriptor
 * @param mapJoinConversionPos parent position that is not given a dummy store; also
 *        passed on to the join descriptors
 * @param numBuckets bucket count recorded in the merge join descriptor and op traits
 * @param adjustParentsChildren whether to bypass the direct parents and generate join keys
 * @throws SemanticException propagated from the map join descriptor construction
 */
private void convertJoinSMBJoin(JoinOperator joinOp, OptimizeTezProcContext context, int mapJoinConversionPos, int numBuckets, boolean adjustParentsChildren) throws SemanticException {
  JoinDesc joinDesc = joinOp.getConf();
  MapJoinDesc mapJoinDesc;
  if (!adjustParentsChildren) {
    // retain the original join desc in the map join.
    mapJoinDesc = new MapJoinDesc(MapJoinProcessor.getKeys(joinDesc.isLeftInputJoin(), joinDesc.getBaseSrc(), joinOp).getSecond(), null, joinDesc.getExprs(), null, null, joinDesc.getOutputColumnNames(), mapJoinConversionPos, joinDesc.getConds(), joinDesc.getFilters(), joinDesc.getNoOuterJoin(), null);
    mapJoinDesc.setNullSafes(joinDesc.getNullSafes());
    mapJoinDesc.setFilterMap(joinDesc.getFilterMap());
    mapJoinDesc.setResidualFilterExprs(joinDesc.getResidualFilterExprs());
    mapJoinDesc.resetOrder();
  } else {
    mapJoinDesc = MapJoinProcessor.getMapJoinDesc(context.conf, joinOp, joinDesc.isLeftInputJoin(), joinDesc.getBaseSrc(), joinDesc.getMapAliases(), mapJoinConversionPos, true);
  }

  CommonMergeJoinOperator mergeJoinOp = (CommonMergeJoinOperator) OperatorFactory.get(joinOp.getCompilationOpContext(), new CommonMergeJoinDesc(numBuckets, mapJoinConversionPos, mapJoinDesc), joinOp.getSchema());

  // Carry the traits and statistics of the old join over, with the new bucket count.
  OpTraits joinTraits = joinOp.getOpTraits();
  int numReduceSinks = joinTraits.getNumReduceSinks();
  mergeJoinOp.setOpTraits(new OpTraits(joinTraits.getBucketColNames(), numBuckets, joinTraits.getSortCols(), numReduceSinks));
  mergeJoinOp.setStatistics(joinOp.getStatistics());

  // Swap the merge join in for the old join in every neighbour, keeping positions.
  for (Operator<? extends OperatorDesc> joinParent : joinOp.getParentOperators()) {
    List<Operator<? extends OperatorDesc>> siblings = joinParent.getChildOperators();
    siblings.set(siblings.indexOf(joinOp), mergeJoinOp);
  }
  for (Operator<? extends OperatorDesc> joinChild : joinOp.getChildOperators()) {
    List<Operator<? extends OperatorDesc>> coParents = joinChild.getParentOperators();
    coParents.set(coParents.indexOf(joinOp), mergeJoinOp);
  }

  // Give the merge join the same children and parents as the old join.
  List<Operator<? extends OperatorDesc>> mergeChildren = mergeJoinOp.getChildOperators();
  mergeChildren.clear();
  mergeChildren.addAll(joinOp.getChildOperators());
  List<Operator<? extends OperatorDesc>> mergeParents = mergeJoinOp.getParentOperators();
  mergeParents.clear();
  mergeParents.addAll(joinOp.getParentOperators());

  mergeJoinOp.getConf().setGenJoinKeys(false);
  if (adjustParentsChildren) {
    mergeJoinOp.getConf().setGenJoinKeys(true);

    // Bypass the direct parents: their parents become the parents of the merge join.
    List<Operator<? extends OperatorDesc>> promoted = new ArrayList<Operator<? extends OperatorDesc>>();
    for (Operator<? extends OperatorDesc> bypassed : mergeJoinOp.getParentOperators()) {
      for (Operator<? extends OperatorDesc> grandParent : bypassed.getParentOperators()) {
        List<Operator<? extends OperatorDesc>> grandChildren = grandParent.getChildOperators();
        grandChildren.remove(bypassed);
        grandChildren.add(mergeJoinOp);
        promoted.add(grandParent);
      }
    }
    mergeJoinOp.getParentOperators().clear();
    mergeJoinOp.getParentOperators().addAll(promoted);

    // Work on a snapshot because the parent list is edited inside the loop.
    List<Operator<? extends OperatorDesc>> snapshot = new ArrayList<Operator<? extends OperatorDesc>>(mergeJoinOp.getParentOperators());
    for (Operator<? extends OperatorDesc> upstream : snapshot) {
      int parentIndex = mergeJoinOp.getParentOperators().indexOf(upstream);
      if (parentIndex != mapJoinConversionPos) {
        // insert the dummy store operator here: upstream --> dummy store --> merge join
        DummyStoreOperator dummyStoreOp = new TezDummyStoreOperator(mergeJoinOp.getCompilationOpContext());

        List<Operator<? extends OperatorDesc>> dummyParents = new ArrayList<Operator<? extends OperatorDesc>>();
        dummyStoreOp.setParentOperators(dummyParents);
        List<Operator<? extends OperatorDesc>> dummyChildren = new ArrayList<Operator<? extends OperatorDesc>>();
        dummyStoreOp.setChildOperators(dummyChildren);
        dummyStoreOp.getChildOperators().add(mergeJoinOp);

        List<Operator<? extends OperatorDesc>> upstreamChildren = upstream.getChildOperators();
        upstreamChildren.set(upstreamChildren.indexOf(mergeJoinOp), dummyStoreOp);
        dummyStoreOp.getParentOperators().add(upstream);

        mergeJoinOp.getParentOperators().set(parentIndex, dummyStoreOp);
      }
    }
  }
  mergeJoinOp.cloneOriginalParentsList(mergeJoinOp.getParentOperators());
}
Aggregations