use of org.apache.hadoop.hive.ql.exec.MapJoinOperator in project hive by apache.
the class GenMapRedUtils method splitTasks.
/**
 * Split two tasks by creating a temporary file between them.
 *
 * @param op reduce sink operator being processed
 * @param parentTask the parent task
 * @param childTask the child task
 * @param opProcCtx context
 */
@SuppressWarnings("nls")
private static void splitTasks(ReduceSinkOperator op, Task<? extends Serializable> parentTask,
    Task<? extends Serializable> childTask, GenMRProcContext opProcCtx) throws SemanticException {
  if (op.getNumParent() != 1) {
    throw new IllegalStateException("Expecting operator " + op + " to have one parent. " +
        "But found multiple parents : " + op.getParentOperators());
  }
  ParseContext parseCtx = opProcCtx.getParseCtx();
  parentTask.addDependentTask(childTask);
  // Root Task cannot depend on any other task, therefore childTask cannot be
  // a root Task
  List<Task<? extends Serializable>> rootTasks = opProcCtx.getRootTasks();
  if (rootTasks.contains(childTask)) {
    rootTasks.remove(childTask);
  }
  // Generate the temporary file name
  Context baseCtx = parseCtx.getContext();
  Path taskTmpDir = baseCtx.getMRTmpPath();
  Operator<? extends OperatorDesc> parent = op.getParentOperators().get(0);
  TableDesc tt_desc = PlanUtils.getIntermediateFileTableDesc(
      PlanUtils.getFieldSchemasFromRowSchema(parent.getSchema(), "temporarycol"));
  // Create the temporary file, its corresponding FileSinkOperator, and
  // its corresponding TableScanOperator.
  TableScanOperator tableScanOp = createTemporaryFile(parent, op, taskTmpDir, tt_desc, parseCtx);
  Map<Operator<? extends OperatorDesc>, GenMapRedCtx> mapCurrCtx = opProcCtx.getMapCurrCtx();
  mapCurrCtx.put(tableScanOp, new GenMapRedCtx(childTask, null));
  String streamDesc = taskTmpDir.toUri().toString();
  MapredWork cplan = (MapredWork) childTask.getWork();
  if (needsTagging(cplan.getReduceWork())) {
    Operator<? extends OperatorDesc> reducerOp = cplan.getReduceWork().getReducer();
    String id = null;
    if (reducerOp instanceof JoinOperator) {
      if (parseCtx.getJoinOps().contains(reducerOp)) {
        id = ((JoinOperator) reducerOp).getConf().getId();
      }
    } else if (reducerOp instanceof MapJoinOperator) {
      if (parseCtx.getMapJoinOps().contains(reducerOp)) {
        id = ((MapJoinOperator) reducerOp).getConf().getId();
      }
    } else if (reducerOp instanceof SMBMapJoinOperator) {
      if (parseCtx.getSmbMapJoinOps().contains(reducerOp)) {
        id = ((SMBMapJoinOperator) reducerOp).getConf().getId();
      }
    }
    if (id != null) {
      streamDesc = id + ":$INTNAME";
    } else {
      streamDesc = "$INTNAME";
    }
    String origStreamDesc = streamDesc;
    int pos = 0;
    while (cplan.getMapWork().getAliasToWork().get(streamDesc) != null) {
      streamDesc = origStreamDesc.concat(String.valueOf(++pos));
    }
    // TODO: Allocate work to remove the temporary files and make that
    // dependent on the redTask
    cplan.getReduceWork().setNeedsTagging(true);
  }
  // Add the path to alias mapping
  setTaskPlan(taskTmpDir, streamDesc, tableScanOp, cplan.getMapWork(), false, tt_desc);
  opProcCtx.setCurrTopOp(null);
  opProcCtx.setCurrAliasId(null);
  opProcCtx.setCurrTask(childTask);
  opProcCtx.addRootIfPossible(parentTask);
}
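The alias-collision loop near the end of splitTasks is a small pattern worth isolating: keep appending an increasing numeric suffix until the alias is unused. A minimal standalone sketch (the names AliasDemo and uniqueAlias are illustrative, not Hive API):

import java.util.HashMap;
import java.util.Map;

public class AliasDemo {
  // Append an increasing numeric suffix until the candidate alias is free,
  // mirroring the streamDesc loop in splitTasks above.
  static String uniqueAlias(Map<String, ?> aliasToWork, String desired) {
    String candidate = desired;
    int pos = 0;
    while (aliasToWork.containsKey(candidate)) {
      candidate = desired + (++pos);
    }
    return candidate;
  }

  public static void main(String[] args) {
    Map<String, Object> aliasToWork = new HashMap<>();
    aliasToWork.put("$INTNAME", new Object());
    aliasToWork.put("$INTNAME1", new Object());
    // Prints $INTNAME2: the first suffix not already taken
    System.out.println(uniqueAlias(aliasToWork, "$INTNAME"));
  }
}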
use of org.apache.hadoop.hive.ql.exec.MapJoinOperator in project hive by apache.
the class MapJoinProcessor method genMapJoinOpAndLocalWork.
/**
 * Convert the join to a map-join and also generate any local work needed.
 *
 * @param conf
 * @param newWork MapredWork in which the conversion is to happen
 * @param op the join operator that needs to be converted to a map-join
 * @param mapJoinPos position of the big table among the join inputs
 * @throws SemanticException
 */
public static void genMapJoinOpAndLocalWork(HiveConf conf, MapredWork newWork, JoinOperator op,
    int mapJoinPos) throws SemanticException {
  // generate the map join operator; already checked the map join
  MapJoinOperator newMapJoinOp = new MapJoinProcessor().convertMapJoin(conf, op,
      newWork.getMapWork().isLeftInputJoin(), newWork.getMapWork().getBaseSrc(),
      newWork.getMapWork().getMapAliases(), mapJoinPos, true, false);
  genLocalWorkForMapJoin(newWork, newMapJoinOp, mapJoinPos);
}
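A hypothetical call site, assuming conf, newWork, and joinOp have already been obtained from the compiler context elsewhere; the last argument selects the big-table position:

// Hypothetical usage; 0 streams the first join input as the big table.
MapJoinProcessor.genMapJoinOpAndLocalWork(conf, newWork, joinOp, 0);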
use of org.apache.hadoop.hive.ql.exec.MapJoinOperator in project hive by apache.
the class MapJoinProcessor method convertSMBJoinToMapJoin.
/**
 * Convert a sort-merge join to a map-side join.
 *
 * @param hconf
 * @param smbJoinOp the sort-merge join operator to convert
 * @param bigTablePos position of the source to be read as part of the
 *          map-reduce framework; all other sources are cached in memory
 * @param noCheckOuterJoin
 */
public static MapJoinOperator convertSMBJoinToMapJoin(HiveConf hconf, SMBMapJoinOperator smbJoinOp,
    int bigTablePos, boolean noCheckOuterJoin) throws SemanticException {
  // Create a new map join operator
  SMBJoinDesc smbJoinDesc = smbJoinOp.getConf();
  List<ExprNodeDesc> keyCols = smbJoinDesc.getKeys().get(Byte.valueOf((byte) 0));
  TableDesc keyTableDesc = PlanUtils.getMapJoinKeyTableDesc(hconf,
      PlanUtils.getFieldSchemasFromColumnList(keyCols, MAPJOINKEY_FIELDPREFIX));
  MapJoinDesc mapJoinDesc = new MapJoinDesc(smbJoinDesc.getKeys(), keyTableDesc,
      smbJoinDesc.getExprs(), smbJoinDesc.getValueTblDescs(), smbJoinDesc.getValueTblDescs(),
      smbJoinDesc.getOutputColumnNames(), bigTablePos, smbJoinDesc.getConds(),
      smbJoinDesc.getFilters(), smbJoinDesc.isNoOuterJoin(), smbJoinDesc.getDumpFilePrefix(),
      smbJoinDesc.getMemoryMonitorInfo(), smbJoinDesc.getInMemoryDataSize());
  mapJoinDesc.setStatistics(smbJoinDesc.getStatistics());
  mapJoinDesc.setColumnExprMap(smbJoinDesc.getColumnExprMap());
  RowSchema joinRS = smbJoinOp.getSchema();
  // The mapjoin has the same schema as the join operator
  MapJoinOperator mapJoinOp = (MapJoinOperator) OperatorFactory.getAndMakeChild(
      smbJoinOp.getCompilationOpContext(), mapJoinDesc, joinRS,
      new ArrayList<Operator<? extends OperatorDesc>>());
  // change the children of the original join operator to point to the map
  // join operator
  List<Operator<? extends OperatorDesc>> childOps = smbJoinOp.getChildOperators();
  for (Operator<? extends OperatorDesc> childOp : childOps) {
    childOp.replaceParent(smbJoinOp, mapJoinOp);
  }
  mapJoinOp.setChildOperators(childOps);
  smbJoinOp.setChildOperators(null);
  // change the parent of the original SMBjoin operator to point to the map
  // join operator
  List<Operator<? extends OperatorDesc>> parentOps = smbJoinOp.getParentOperators();
  for (Operator<? extends OperatorDesc> parentOp : parentOps) {
    parentOp.replaceChild(smbJoinOp, mapJoinOp);
  }
  mapJoinOp.setParentOperators(parentOps);
  smbJoinOp.setParentOperators(null);
  return mapJoinOp;
}
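The rewiring at the end of convertSMBJoinToMapJoin is the classic swap of one node for another in a doubly-linked operator DAG: children repoint their parent link, parents repoint their child link, and the new node adopts both edge lists. A minimal standalone sketch of that pattern, with Node as an illustrative stand-in for Hive's Operator class (not Hive API):

import java.util.ArrayList;
import java.util.List;

class Node {
  final String name;
  List<Node> parents = new ArrayList<>();
  List<Node> children = new ArrayList<>();

  Node(String name) { this.name = name; }

  // Swap oldOp for newOp everywhere it appears in the DAG.
  static void replace(Node oldOp, Node newOp) {
    for (Node child : oldOp.children) {
      child.parents.set(child.parents.indexOf(oldOp), newOp);
    }
    for (Node parent : oldOp.parents) {
      parent.children.set(parent.children.indexOf(oldOp), newOp);
    }
    // newOp adopts oldOp's edges; oldOp is detached, as the Hive code
    // does with setChildOperators(null) / setParentOperators(null).
    newOp.children = oldOp.children;
    newOp.parents = oldOp.parents;
    oldOp.children = null;
    oldOp.parents = null;
  }
}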
use of org.apache.hadoop.hive.ql.exec.MapJoinOperator in project hive by apache.
the class SharedWorkOptimizer method extractSharedOptimizationInfo.
private static SharedResult extractSharedOptimizationInfo(ParseContext pctx,
    SharedWorkOptimizerCache optimizerCache, Operator<?> retainableOpEqualParent,
    Operator<?> discardableOpEqualParent, Operator<?> retainableOp, Operator<?> discardableOp,
    LinkedHashSet<Operator<?>> retainableOps, LinkedHashSet<Operator<?>> discardableOps,
    Set<Operator<?>> discardableInputOps, boolean removeInputBranch) throws SemanticException {
  Operator<?> equalOp1 = retainableOpEqualParent;
  Operator<?> equalOp2 = discardableOpEqualParent;
  Operator<?> currentOp1 = retainableOp;
  Operator<?> currentOp2 = discardableOp;
  long dataSize = 0L;
  long maxDataSize = 0L;
  // Try to merge rest of operators
  while (!(currentOp1 instanceof ReduceSinkOperator)) {
    // Check whether current operators are equal
    if (!compareOperator(pctx, currentOp1, currentOp2)) {
      // If they are not equal, we could zip up till here
      break;
    }
    if (currentOp1.getParentOperators().size() != currentOp2.getParentOperators().size()) {
      // If they are not equal, we could zip up till here
      break;
    }
    if (currentOp1.getParentOperators().size() > 1) {
      List<Operator<?>> discardableOpsForCurrentOp = new ArrayList<>();
      int idx = 0;
      for (; idx < currentOp1.getParentOperators().size(); idx++) {
        Operator<?> parentOp1 = currentOp1.getParentOperators().get(idx);
        Operator<?> parentOp2 = currentOp2.getParentOperators().get(idx);
        if (parentOp1 == equalOp1 && parentOp2 == equalOp2 && !removeInputBranch) {
          continue;
        }
        if ((parentOp1 == equalOp1 && parentOp2 != equalOp2) ||
            (parentOp1 != equalOp1 && parentOp2 == equalOp2)) {
          // Input operator is not in the same position
          break;
        }
        // Compare input
        List<Operator<?>> removeOpsForCurrentInput = compareAndGatherOps(pctx, parentOp1, parentOp2);
        if (removeOpsForCurrentInput == null) {
          // Inputs are not the same, bail out
          break;
        }
        // Add inputs to ops to remove
        discardableOpsForCurrentOp.addAll(removeOpsForCurrentInput);
      }
      if (idx != currentOp1.getParentOperators().size()) {
        // If inputs are not equal, we could zip up till here
        break;
      }
      discardableInputOps.addAll(discardableOpsForCurrentOp);
    }
    equalOp1 = currentOp1;
    equalOp2 = currentOp2;
    retainableOps.add(equalOp1);
    discardableOps.add(equalOp2);
    if (equalOp1 instanceof MapJoinOperator) {
      MapJoinOperator mop = (MapJoinOperator) equalOp1;
      dataSize = StatsUtils.safeAdd(dataSize, mop.getConf().getInMemoryDataSize());
      maxDataSize = mop.getConf().getMemoryMonitorInfo().getAdjustedNoConditionalTaskSize();
    }
    if (currentOp1.getChildOperators().size() > 1 || currentOp2.getChildOperators().size() > 1) {
      // TODO: Support checking multiple child operators to merge further.
      break;
    }
    // Update for next iteration
    currentOp1 = currentOp1.getChildOperators().get(0);
    currentOp2 = currentOp2.getChildOperators().get(0);
  }
  // Add the rest to the memory consumption
  Set<Operator<?>> opsWork1 = findWorkOperators(optimizerCache, currentOp1);
  for (Operator<?> op : opsWork1) {
    if (op instanceof MapJoinOperator && !retainableOps.contains(op)) {
      MapJoinOperator mop = (MapJoinOperator) op;
      dataSize = StatsUtils.safeAdd(dataSize, mop.getConf().getInMemoryDataSize());
      maxDataSize = mop.getConf().getMemoryMonitorInfo().getAdjustedNoConditionalTaskSize();
    }
  }
  Set<Operator<?>> opsWork2 = findWorkOperators(optimizerCache, currentOp2);
  for (Operator<?> op : opsWork2) {
    if (op instanceof MapJoinOperator && !discardableOps.contains(op)) {
      MapJoinOperator mop = (MapJoinOperator) op;
      dataSize = StatsUtils.safeAdd(dataSize, mop.getConf().getInMemoryDataSize());
      maxDataSize = mop.getConf().getMemoryMonitorInfo().getAdjustedNoConditionalTaskSize();
    }
  }
  discardableInputOps.addAll(gatherDPPBranchOps(pctx, optimizerCache, discardableInputOps));
  discardableInputOps.addAll(gatherDPPBranchOps(pctx, optimizerCache, discardableOps));
  discardableInputOps.addAll(gatherDPPBranchOps(pctx, optimizerCache, retainableOps, discardableInputOps));
  return new SharedResult(retainableOps, discardableOps, discardableInputOps, dataSize, maxDataSize);
}
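The dataSize accounting goes through StatsUtils.safeAdd rather than a plain +, so an overflow saturates instead of wrapping to a negative size. A sketch in the same spirit (the real StatsUtils implementation may differ in detail):

// Overflow-safe addition: saturate at Long.MAX_VALUE instead of wrapping.
static long safeAdd(long a, long b) {
  try {
    return Math.addExact(a, b);
  } catch (ArithmeticException overflow) {
    return Long.MAX_VALUE;
  }
}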
use of org.apache.hadoop.hive.ql.exec.MapJoinOperator in project hive by apache.
the class SparkMapJoinProcessor method convertMapJoin.
/**
 * Convert a regular join to a map-side join.
 *
 * @param conf
 * @param op join operator
 * @param leftSrc
 * @param baseSrc
 * @param mapAliases
 * @param bigTablePos position of the source to be read as part of the
 *          map-reduce framework; all other sources are cached in memory
 * @param noCheckOuterJoin
 * @param validateMapJoinTree
 */
@Override
public MapJoinOperator convertMapJoin(HiveConf conf, JoinOperator op, boolean leftSrc,
    String[] baseSrc, List<String> mapAliases, int bigTablePos, boolean noCheckOuterJoin,
    boolean validateMapJoinTree) throws SemanticException {
  // outer join cannot be performed on a table which is being cached
  JoinCondDesc[] condns = op.getConf().getConds();
  if (!noCheckOuterJoin) {
    if (checkMapJoin(bigTablePos, condns) < 0) {
      throw new SemanticException(ErrorMsg.NO_OUTER_MAPJOIN.getMsg());
    }
  }
  // create the map-join operator
  MapJoinOperator mapJoinOp = convertJoinOpMapJoinOp(conf, op, op.getConf().isLeftInputJoin(),
      op.getConf().getBaseSrc(), op.getConf().getMapAliases(), bigTablePos, noCheckOuterJoin);
  // 1. remove RS as parent for the big table branch
  // 2. remove old join op from child set of all the RSs
  List<Operator<? extends OperatorDesc>> parentOps = mapJoinOp.getParentOperators();
  for (int i = 0; i < parentOps.size(); i++) {
    Operator<? extends OperatorDesc> parentOp = parentOps.get(i);
    parentOp.getChildOperators().remove(op);
    if (i == bigTablePos) {
      List<Operator<? extends OperatorDesc>> grandParentOps = parentOp.getParentOperators();
      Preconditions.checkArgument(grandParentOps.size() == 1,
          "AssertionError: expect number of parents to be 1, but was " + grandParentOps.size());
      Operator<? extends OperatorDesc> grandParentOp = grandParentOps.get(0);
      grandParentOp.replaceChild(parentOp, mapJoinOp);
      mapJoinOp.replaceParent(parentOp, grandParentOp);
    }
  }
  return mapJoinOp;
}
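On the big-table branch the intermediate ReduceSink is spliced out entirely, so the grandparent links straight to the new map-join while the small-table branches keep their ReduceSinks. Reusing the illustrative Node class from the earlier sketch (not Hive API), the splice reduces to:

// Splice out mid on one branch: grandParent -> mid -> joinOp becomes
// grandParent -> joinOp. Node is the illustrative class defined above.
static void spliceOut(Node grandParent, Node mid, Node joinOp) {
  grandParent.children.set(grandParent.children.indexOf(mid), joinOp);
  joinOp.parents.set(joinOp.parents.indexOf(mid), grandParent);
}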