use of org.apache.hadoop.hive.ql.exec.JoinOperator in project hive by apache.
the class GenMapRedUtils method splitTasks.
/**
 * Split two tasks by creating a temporary file between them.
 *
 * @param op reduce sink operator being processed
 * @param parentTask the parent task
 * @param childTask the child task
 * @param opProcCtx context
 */
@SuppressWarnings("nls")
private static void splitTasks(ReduceSinkOperator op, Task<? extends Serializable> parentTask, Task<? extends Serializable> childTask, GenMRProcContext opProcCtx) throws SemanticException {
  if (op.getNumParent() != 1) {
    throw new IllegalStateException("Expecting operator " + op + " to have one parent. " + "But found multiple parents : " + op.getParentOperators());
  }
  ParseContext parseCtx = opProcCtx.getParseCtx();
  parentTask.addDependentTask(childTask);
  // Root Task cannot depend on any other task, therefore childTask cannot be
  // a root Task
  List<Task<? extends Serializable>> rootTasks = opProcCtx.getRootTasks();
  if (rootTasks.contains(childTask)) {
    rootTasks.remove(childTask);
  }
  // Generate the temporary file name
  Context baseCtx = parseCtx.getContext();
  Path taskTmpDir = baseCtx.getMRTmpPath();
  Operator<? extends OperatorDesc> parent = op.getParentOperators().get(0);
  TableDesc tt_desc = PlanUtils.getIntermediateFileTableDesc(PlanUtils.getFieldSchemasFromRowSchema(parent.getSchema(), "temporarycol"));
  // Create the temporary file, its corresponding FileSinkOperator, and
  // its corresponding TableScanOperator.
  TableScanOperator tableScanOp = createTemporaryFile(parent, op, taskTmpDir, tt_desc, parseCtx);
  Map<Operator<? extends OperatorDesc>, GenMapRedCtx> mapCurrCtx = opProcCtx.getMapCurrCtx();
  mapCurrCtx.put(tableScanOp, new GenMapRedCtx(childTask, null));
  String streamDesc = taskTmpDir.toUri().toString();
  MapredWork cplan = (MapredWork) childTask.getWork();
  if (needsTagging(cplan.getReduceWork())) {
    Operator<? extends OperatorDesc> reducerOp = cplan.getReduceWork().getReducer();
    String id = null;
    if (reducerOp instanceof JoinOperator) {
      if (parseCtx.getJoinOps().contains(reducerOp)) {
        id = ((JoinOperator) reducerOp).getConf().getId();
      }
    } else if (reducerOp instanceof MapJoinOperator) {
      if (parseCtx.getMapJoinOps().contains(reducerOp)) {
        id = ((MapJoinOperator) reducerOp).getConf().getId();
      }
    } else if (reducerOp instanceof SMBMapJoinOperator) {
      if (parseCtx.getSmbMapJoinOps().contains(reducerOp)) {
        id = ((SMBMapJoinOperator) reducerOp).getConf().getId();
      }
    }
    if (id != null) {
      streamDesc = id + ":$INTNAME";
    } else {
      streamDesc = "$INTNAME";
    }
    String origStreamDesc = streamDesc;
    int pos = 0;
    while (cplan.getMapWork().getAliasToWork().get(streamDesc) != null) {
      streamDesc = origStreamDesc.concat(String.valueOf(++pos));
    }
    // TODO: Allocate work to remove the temporary files and make that
    // dependent on the redTask
    cplan.getReduceWork().setNeedsTagging(true);
  }
  // Add the path to alias mapping
  setTaskPlan(taskTmpDir, streamDesc, tableScanOp, cplan.getMapWork(), false, tt_desc);
  opProcCtx.setCurrTopOp(null);
  opProcCtx.setCurrAliasId(null);
  opProcCtx.setCurrTask(childTask);
  opProcCtx.addRootIfPossible(parentTask);
}
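One detail worth calling out: when the reducer needs tagging, the intermediate stream is registered under the alias "$INTNAME" (optionally prefixed with the join operator id), and a numeric suffix is appended until the alias is unused in aliasToWork. Below is a minimal standalone sketch of that uniquing pattern against a plain HashMap; the class and method names (AliasUniquer, uniqueAlias) are hypothetical illustrations, not Hive API.

import java.util.HashMap;
import java.util.Map;

public class AliasUniquer {

  // Returns 'base' if it is free, otherwise base1, base2, ...
  // (mirrors the streamDesc loop in splitTasks, but against a plain Map).
  static String uniqueAlias(String base, Map<String, Object> aliasToWork) {
    String candidate = base;
    int pos = 0;
    while (aliasToWork.get(candidate) != null) {
      candidate = base.concat(String.valueOf(++pos));
    }
    return candidate;
  }

  public static void main(String[] args) {
    Map<String, Object> aliasToWork = new HashMap<>();
    aliasToWork.put("$INTNAME", new Object());
    aliasToWork.put("$INTNAME1", new Object());
    // Prints "$INTNAME2": the first two candidates are already taken.
    System.out.println(uniqueAlias("$INTNAME", aliasToWork));
  }
}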
use of org.apache.hadoop.hive.ql.exec.JoinOperator in project hive by apache.
the class SparkMapJoinProcessor method convertMapJoin.
/**
 * Convert a regular join to a map-side join.
 *
 * @param conf
 * @param op join operator
 * @param leftSrc
 * @param baseSrc
 * @param mapAliases
 * @param bigTablePos position of the source to be read as part of
 *   map-reduce framework. All other sources are cached in memory
 * @param noCheckOuterJoin
 * @param validateMapJoinTree
 */
@Override
public MapJoinOperator convertMapJoin(HiveConf conf, JoinOperator op, boolean leftSrc, String[] baseSrc, List<String> mapAliases, int bigTablePos, boolean noCheckOuterJoin, boolean validateMapJoinTree) throws SemanticException {
  // outer join cannot be performed on a table which is being cached
  JoinCondDesc[] condns = op.getConf().getConds();
  if (!noCheckOuterJoin) {
    if (checkMapJoin(bigTablePos, condns) < 0) {
      throw new SemanticException(ErrorMsg.NO_OUTER_MAPJOIN.getMsg());
    }
  }
  // create the map-join operator
  MapJoinOperator mapJoinOp = convertJoinOpMapJoinOp(conf, op, op.getConf().isLeftInputJoin(), op.getConf().getBaseSrc(), op.getConf().getMapAliases(), bigTablePos, noCheckOuterJoin);
  // 1. remove RS as parent for the big table branch
  // 2. remove old join op from child set of all the RSs
  List<Operator<? extends OperatorDesc>> parentOps = mapJoinOp.getParentOperators();
  for (int i = 0; i < parentOps.size(); i++) {
    Operator<? extends OperatorDesc> parentOp = parentOps.get(i);
    parentOp.getChildOperators().remove(op);
    if (i == bigTablePos) {
      List<Operator<? extends OperatorDesc>> grandParentOps = parentOp.getParentOperators();
      Preconditions.checkArgument(grandParentOps.size() == 1, "AssertionError: expect number of parents to be 1, but was " + grandParentOps.size());
      Operator<? extends OperatorDesc> grandParentOp = grandParentOps.get(0);
      grandParentOp.replaceChild(parentOp, mapJoinOp);
      mapJoinOp.replaceParent(parentOp, grandParentOp);
    }
  }
  return mapJoinOp;
}
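The loop above rewires the operator DAG: every parent reduce sink drops the old join from its children, and on the big-table branch the reduce sink is bypassed entirely by connecting its grandparent directly to the new MapJoinOperator. The following is a self-contained sketch of that rewiring pattern using a hypothetical Node class rather than Hive's Operator hierarchy; only the replaceChild/replaceParent idea is taken from the snippet above.

import java.util.ArrayList;
import java.util.List;

// Hypothetical minimal DAG node; Hive's Operator exposes analogous
// replaceChild/replaceParent methods used in convertMapJoin above.
class Node {
  final String name;
  final List<Node> parents = new ArrayList<>();
  final List<Node> children = new ArrayList<>();

  Node(String name) { this.name = name; }

  void replaceChild(Node oldChild, Node newChild) {
    children.set(children.indexOf(oldChild), newChild);
  }

  void replaceParent(Node oldParent, Node newParent) {
    parents.set(parents.indexOf(oldParent), newParent);
  }
}

public class BigTableBypassSketch {
  public static void main(String[] args) {
    // scan -> rs (reduce sink) -> mapJoin, with rs standing in for the
    // big-table branch that the conversion wants to bypass.
    Node scan = new Node("TS"), rs = new Node("RS"), mapJoin = new Node("MAPJOIN");
    scan.children.add(rs);
    rs.parents.add(scan);
    rs.children.add(mapJoin);
    mapJoin.parents.add(rs);

    // Bypass the reduce sink on the big-table branch: the scan now feeds
    // the map join directly, mirroring the grandParentOp rewiring above.
    scan.replaceChild(rs, mapJoin);
    mapJoin.replaceParent(rs, scan);

    System.out.println(scan.children.get(0).name);   // MAPJOIN
    System.out.println(mapJoin.parents.get(0).name); // TS
  }
}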
use of org.apache.hadoop.hive.ql.exec.JoinOperator in project hive by apache.
the class JoinReorder method transform.
/**
 * Transform the query tree. For each join, check which reduce sink will
 * output the biggest result (based on STREAMTABLE hints) and give it the
 * biggest tag so that it gets streamed.
 *
 * @param pactx current parse context
 */
@Override
public ParseContext transform(ParseContext pactx) throws SemanticException {
  Set<String> bigTables = getBigTables(pactx);
  cache.clear();
  for (JoinOperator joinOp : pactx.getJoinOps()) {
    reorder(joinOp, bigTables);
  }
  return pactx;
}
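The intent of the reorder step is that inputs named in a STREAMTABLE hint end up with the biggest tag, so they are streamed through the reducer rather than buffered. A minimal sketch of that idea on plain alias lists follows; the names (StreamTableOrderSketch, reorder) are hypothetical and do not reflect the actual signature of JoinReorder.reorder.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;

public class StreamTableOrderSketch {

  // Push hinted "big" aliases to the end of the tag order so they receive
  // the biggest tag and are streamed instead of buffered.
  static List<String> reorder(List<String> tagOrder, Set<String> bigTables) {
    List<String> small = new ArrayList<>();
    List<String> big = new ArrayList<>();
    for (String alias : tagOrder) {
      (bigTables.contains(alias) ? big : small).add(alias);
    }
    small.addAll(big);
    return small;
  }

  public static void main(String[] args) {
    List<String> tagOrder = Arrays.asList("orders", "customers", "lineitem");
    Set<String> bigTables = new LinkedHashSet<>(Arrays.asList("orders"));
    // Prints [customers, lineitem, orders]: the hinted table moves to the
    // last position and therefore gets the biggest tag.
    System.out.println(reorder(tagOrder, bigTables));
  }
}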
use of org.apache.hadoop.hive.ql.exec.JoinOperator in project hive by apache.
the class CorrelationOptimizer method findPossibleAutoConvertedJoinOperators.
private void findPossibleAutoConvertedJoinOperators() throws SemanticException {
  // based on hive.auto.convert.join.noconditionaltask.size.
  for (JoinOperator joinOp : pCtx.getJoinOps()) {
    boolean isAbleToGuess = true;
    boolean mayConvert = false;
    // Get total size and individual alias's size
    long aliasTotalKnownInputSize = 0;
    Map<String, Long> aliasToSize = new HashMap<String, Long>();
    Map<Integer, Set<String>> posToAliases = new HashMap<Integer, Set<String>>();
    for (int pos = 0; pos < joinOp.getNumParent(); pos++) {
      Operator<? extends OperatorDesc> op = joinOp.getParentOperators().get(pos);
      Set<TableScanOperator> topOps = CorrelationUtilities.findTableScanOperators(op);
      if (topOps.isEmpty()) {
        isAbleToGuess = false;
        break;
      }
      Set<String> aliases = new LinkedHashSet<String>();
      for (TableScanOperator tsop : topOps) {
        Table table = tsop.getConf().getTableMetadata();
        if (table == null) {
          // table should not be null.
          throw new SemanticException("The table of " + tsop.getName() + " " + tsop.getIdentifier() + " is null, which is not expected.");
        }
        String alias = tsop.getConf().getAlias();
        aliases.add(alias);
        Path p = table.getPath();
        ContentSummary resultCs = null;
        try {
          FileSystem fs = table.getPath().getFileSystem(pCtx.getConf());
          resultCs = fs.getContentSummary(p);
        } catch (IOException e) {
          LOG.warn("Encountered an error while querying content summary of table " + table.getCompleteName() + " from FileSystem. " + "Cannot guess if CommonJoinOperator will optimize " + joinOp.getName() + " " + joinOp.getIdentifier());
        }
        if (resultCs == null) {
          isAbleToGuess = false;
          break;
        }
        long size = resultCs.getLength();
        aliasTotalKnownInputSize += size;
        Long es = aliasToSize.get(alias);
        if (es == null) {
          es = Long.valueOf(0);
        }
        es += size;
        aliasToSize.put(alias, es);
      }
      posToAliases.put(pos, aliases);
    }
    if (!isAbleToGuess) {
      LOG.info("Cannot guess if CommonJoinOperator will optimize " + joinOp.getName() + " " + joinOp.getIdentifier());
      continue;
    }
    JoinDesc joinDesc = joinOp.getConf();
    Byte[] order = joinDesc.getTagOrder();
    int numAliases = order.length;
    Set<Integer> bigTableCandidates = MapJoinProcessor.getBigTableCandidates(joinDesc.getConds());
    if (bigTableCandidates.isEmpty()) {
      continue;
    }
    long ThresholdOfSmallTblSizeSum = HiveConf.getLongVar(pCtx.getConf(), HiveConf.ConfVars.HIVESMALLTABLESFILESIZE);
    for (int i = 0; i < numAliases; i++) {
      // this table cannot be big table
      if (!bigTableCandidates.contains(i)) {
        continue;
      }
      Set<String> aliases = posToAliases.get(i);
      long aliasKnownSize = Utilities.sumOf(aliasToSize, aliases);
      if (!CommonJoinTaskDispatcher.cannotConvert(aliasKnownSize, aliasTotalKnownInputSize, ThresholdOfSmallTblSizeSum)) {
        mayConvert = true;
      }
    }
    if (mayConvert) {
      LOG.info(joinOp.getName() + " " + joinOp.getIdentifier() + " may be converted to MapJoin by CommonJoinResolver");
      skipedJoinOperators.add(joinOp);
    }
  }
}
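The size test delegated to CommonJoinTaskDispatcher.cannotConvert is, in essence, a check that everything except the big-table candidate fits under the small-table size threshold. The sketch below restates that check in isolation; the exact semantics of cannotConvert and the class/method names used here (MapJoinSizeCheckSketch, mayConvert) are assumptions made for illustration, not Hive code.

import java.util.HashMap;
import java.util.Map;

public class MapJoinSizeCheckSketch {

  // A position may host the big table when everything *else* fits under
  // the small-table size threshold (assumed semantics of cannotConvert).
  static boolean mayConvert(Map<String, Long> aliasToSize, String bigTableAlias, long smallTableThreshold) {
    long total = 0;
    for (long size : aliasToSize.values()) {
      total += size;
    }
    long bigTableSize = aliasToSize.getOrDefault(bigTableAlias, 0L);
    long everythingElse = total - bigTableSize;
    return everythingElse <= smallTableThreshold;
  }

  public static void main(String[] args) {
    Map<String, Long> aliasToSize = new HashMap<>();
    aliasToSize.put("lineitem", 10_000_000_000L);
    aliasToSize.put("nation", 2_000L);
    aliasToSize.put("region", 1_000L);
    long threshold = 25_000_000L; // stand-in for the configured small-table file size
    // Prints true: the two small tables together stay well below the threshold.
    System.out.println(mayConvert(aliasToSize, "lineitem", threshold));
  }
}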
use of org.apache.hadoop.hive.ql.exec.JoinOperator in project hive by apache.
the class HiveOpConverter method translateJoin.
private OpAttr translateJoin(RelNode joinRel) throws SemanticException {
  // 0. Additional data structures needed for the join optimization
  // through Hive
  String[] baseSrc = new String[joinRel.getInputs().size()];
  String tabAlias = getHiveDerivedTableAlias();
  // 1. Convert inputs
  OpAttr[] inputs = new OpAttr[joinRel.getInputs().size()];
  List<Operator<?>> children = new ArrayList<Operator<?>>(joinRel.getInputs().size());
  for (int i = 0; i < inputs.length; i++) {
    inputs[i] = dispatch(joinRel.getInput(i));
    children.add(inputs[i].inputs.get(0));
    baseSrc[i] = inputs[i].tabAlias;
  }
  // 2. Generate tags
  for (int tag = 0; tag < children.size(); tag++) {
    ReduceSinkOperator reduceSinkOp = (ReduceSinkOperator) children.get(tag);
    reduceSinkOp.getConf().setTag(tag);
  }
  // 3. Virtual columns
  Set<Integer> newVcolsInCalcite = new HashSet<Integer>();
  newVcolsInCalcite.addAll(inputs[0].vcolsInCalcite);
  if (joinRel instanceof HiveMultiJoin || !(joinRel instanceof SemiJoin)) {
    int shift = inputs[0].inputs.get(0).getSchema().getSignature().size();
    for (int i = 1; i < inputs.length; i++) {
      newVcolsInCalcite.addAll(HiveCalciteUtil.shiftVColsSet(inputs[i].vcolsInCalcite, shift));
      shift += inputs[i].inputs.get(0).getSchema().getSignature().size();
    }
  }
  if (LOG.isDebugEnabled()) {
    LOG.debug("Translating operator rel#" + joinRel.getId() + ":" + joinRel.getRelTypeName() + " with row type: [" + joinRel.getRowType() + "]");
  }
  // 4. Extract join key expressions from HiveSortExchange
  ExprNodeDesc[][] joinExpressions = new ExprNodeDesc[inputs.length][];
  for (int i = 0; i < inputs.length; i++) {
    joinExpressions[i] = ((HiveSortExchange) joinRel.getInput(i)).getJoinExpressions();
  }
  // 5. Extract rest of join predicate info. We infer the rest of join condition
  // that will be added to the filters (join conditions that are not part of
  // the join key)
  List<RexNode> joinFilters;
  if (joinRel instanceof HiveJoin) {
    joinFilters = ImmutableList.of(((HiveJoin) joinRel).getJoinFilter());
  } else if (joinRel instanceof HiveMultiJoin) {
    joinFilters = ((HiveMultiJoin) joinRel).getJoinFilters();
  } else if (joinRel instanceof HiveSemiJoin) {
    joinFilters = ImmutableList.of(((HiveSemiJoin) joinRel).getJoinFilter());
  } else {
    throw new SemanticException("Can't handle join type: " + joinRel.getClass().getName());
  }
  List<List<ExprNodeDesc>> filterExpressions = Lists.newArrayList();
  for (int i = 0; i < joinFilters.size(); i++) {
    List<ExprNodeDesc> filterExpressionsForInput = new ArrayList<ExprNodeDesc>();
    if (joinFilters.get(i) != null) {
      for (RexNode conj : RelOptUtil.conjunctions(joinFilters.get(i))) {
        ExprNodeDesc expr = convertToExprNode(conj, joinRel, null, newVcolsInCalcite);
        filterExpressionsForInput.add(expr);
      }
    }
    filterExpressions.add(filterExpressionsForInput);
  }
  // 6. Generate Join operator
  JoinOperator joinOp = genJoin(joinRel, joinExpressions, filterExpressions, children, baseSrc, tabAlias);
  // 7. Return result
  return new OpAttr(tabAlias, newVcolsInCalcite, joinOp);
}
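Step 3 above shifts each input's virtual-column positions by the accumulated width of the preceding inputs, so the indices stay correct in the joined row schema. Here is a small self-contained sketch of that shifting; the helper (VColShiftSketch.shift) is a hypothetical stand-in for HiveCalciteUtil.shiftVColsSet.

import java.util.Arrays;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.Set;

public class VColShiftSketch {

  // Offset each virtual-column position by the combined width of the
  // inputs that precede it in the joined row.
  static Set<Integer> shift(Set<Integer> vcols, int offset) {
    Set<Integer> shifted = new LinkedHashSet<>();
    for (int pos : vcols) {
      shifted.add(pos + offset);
    }
    return shifted;
  }

  public static void main(String[] args) {
    // Left input has 5 columns with a virtual column at position 4;
    // right input has a virtual column at position 2.
    Set<Integer> leftVcols = new HashSet<>(Arrays.asList(4));
    Set<Integer> rightVcols = new HashSet<>(Arrays.asList(2));
    int leftWidth = 5;

    Set<Integer> joined = new LinkedHashSet<>(leftVcols);
    joined.addAll(shift(rightVcols, leftWidth));
    // Prints [4, 7]: the right input's virtual column lands after the
    // left input's 5 columns in the joined schema.
    System.out.println(joined);
  }
}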