Use of org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator in project hive by apache.
From the class GenSparkUtils, method annotateMapWork.
/**
* Fill MapWork with 'local' work and bucket information for SMB Join.
* @param context context, containing references to MapWorks and their SMB information.
* @throws SemanticException
*/
public void annotateMapWork(GenSparkProcContext context) throws SemanticException {
for (SMBMapJoinOperator smbMapJoinOp : context.smbMapJoinCtxMap.keySet()) {
// initialize mapwork with smbMapJoin information.
SparkSMBMapJoinInfo smbMapJoinInfo = context.smbMapJoinCtxMap.get(smbMapJoinOp);
MapWork work = smbMapJoinInfo.mapWork;
SparkSortMergeJoinFactory.annotateMapWork(context, work, smbMapJoinOp, (TableScanOperator) smbMapJoinInfo.bigTableRootOp, false);
for (Operator<?> smallTableRootOp : smbMapJoinInfo.smallTableRootOps) {
SparkSortMergeJoinFactory.annotateMapWork(context, work, smbMapJoinOp, (TableScanOperator) smallTableRootOp, true);
}
}
}
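annotateMapWork above reads three fields straight off the SparkSMBMapJoinInfo holder (mapWork, bigTableRootOp, smallTableRootOps). A minimal sketch of such a holder, with the field names taken from the snippet and everything else assumed, might look like this:

import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.plan.MapWork;

// Hypothetical sketch only; the field names mirror the accesses in annotateMapWork.
class SparkSMBMapJoinInfoSketch {
  // root TableScanOperator of the big-table branch of the SMB join
  Operator<?> bigTableRootOp;
  // root operators of the small-table branches
  final List<Operator<?>> smallTableRootOps = new ArrayList<Operator<?>>();
  // the MapWork that all branches of this SMB join are packed into
  MapWork mapWork;
}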
Use of org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator in project hive by apache.
From the class SemanticAnalyzer, method analyzeInternal.
void analyzeInternal(ASTNode ast, PlannerContext plannerCtx) throws SemanticException {
// 1. Generate Resolved Parse tree from syntax tree
LOG.info("Starting Semantic Analysis");
//change the location of position alias process here
processPositionAlias(ast);
if (!genResolvedParseTree(ast, plannerCtx)) {
return;
}
// 2. Gen OP Tree from resolved Parse Tree
Operator sinkOp = genOPTree(ast, plannerCtx);
if (!unparseTranslator.isEnabled() && tableMask.isEnabled()) {
// Here we rewrite the AST: expand the * and apply the table masking and filtering
ASTNode tree = rewriteASTWithMaskAndFilter(ast);
if (tree != ast) {
ctx.setSkipTableMasking(true);
init(true);
//change the location of position alias process here
processPositionAlias(tree);
genResolvedParseTree(tree, plannerCtx);
if (this instanceof CalcitePlanner) {
((CalcitePlanner) this).resetCalciteConfiguration();
}
sinkOp = genOPTree(tree, plannerCtx);
}
}
// 3. Deduce Resultset Schema
if (createVwDesc != null && !this.ctx.isCboSucceeded()) {
resultSchema = convertRowSchemaToViewSchema(opParseCtx.get(sinkOp).getRowResolver());
} else {
// resultSchema will only be non-null here if CBO's new return path succeeded;
// otherwise derive it below from the sink operator's row resolver.
if (resultSchema == null) {
resultSchema = convertRowSchemaToResultSetSchema(opParseCtx.get(sinkOp).getRowResolver(), HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_RESULTSET_USE_UNIQUE_COLUMN_NAMES));
}
}
// 4. Generate Parse Context for Optimizer & Physical compiler
copyInfoToQueryProperties(queryProperties);
ParseContext pCtx = new ParseContext(queryState, opToPartPruner, opToPartList, topOps,
    new HashSet<JoinOperator>(joinContext.keySet()),
    new HashSet<SMBMapJoinOperator>(smbMapJoinContext.keySet()),
    loadTableWork, loadFileWork, columnStatsAutoGatherContexts, ctx, idToTableNameMap,
    destTableId, uCtx, listMapJoinOpsNoReducer, prunedPartitions, tabNameToTabObject,
    opToSamplePruner, globalLimitCtx, nameToSplitSample, inputs, rootTasks,
    opToPartToSkewedPruner, viewAliasToInput,
    reduceSinkOperatorsAddedByEnforceBucketingSorting, analyzeRewrite, tableDesc,
    createVwDesc, queryProperties, viewProjectToTableSchema, acidFileSinks);
// 5. Take care of view creation
if (createVwDesc != null) {
if (ctx.getExplainAnalyze() == AnalyzeState.RUNNING) {
return;
}
if (!ctx.isCboSucceeded()) {
saveViewDefinition();
}
// Validate the create view statement; at this point createVwDesc has
// all the information needed for the semantic check.
validateCreateView();
if (!createVwDesc.isMaterialized()) {
// Since we're only creating a view (not executing it), we don't need to
// optimize or translate the plan (and in fact, those procedures can
// interfere with the view creation). So skip the rest of this method.
ctx.setResDir(null);
ctx.setResFile(null);
try {
PlanUtils.addInputsForView(pCtx);
} catch (HiveException e) {
throw new SemanticException(e);
}
// Generate lineage info for create view statements
// if LineageLogger hook is configured.
// Add the transformation that computes the lineage information.
Set<String> postExecHooks = Sets.newHashSet(Splitter.on(",").trimResults().omitEmptyStrings().split(Strings.nullToEmpty(HiveConf.getVar(conf, HiveConf.ConfVars.POSTEXECHOOKS))));
if (postExecHooks.contains("org.apache.hadoop.hive.ql.hooks.PostExecutePrinter") || postExecHooks.contains("org.apache.hadoop.hive.ql.hooks.LineageLogger") || postExecHooks.contains("org.apache.atlas.hive.hook.HiveHook")) {
ArrayList<Transform> transformations = new ArrayList<Transform>();
transformations.add(new HiveOpConverterPostProc());
transformations.add(new Generator());
for (Transform t : transformations) {
pCtx = t.transform(pCtx);
}
// we just use the view name as the location.
SessionState.get().getLineageState().mapDirToOp(new Path(createVwDesc.getViewName()), sinkOp);
}
return;
}
}
// 6. Generate table access stats if required
if (HiveConf.getBoolVar(this.conf, HiveConf.ConfVars.HIVE_STATS_COLLECT_TABLEKEYS)) {
TableAccessAnalyzer tableAccessAnalyzer = new TableAccessAnalyzer(pCtx);
setTableAccessInfo(tableAccessAnalyzer.analyzeTableAccess());
}
// 7. Perform Logical optimization
if (LOG.isDebugEnabled()) {
LOG.debug("Before logical optimization\n" + Operator.toString(pCtx.getTopOps().values()));
}
Optimizer optm = new Optimizer();
optm.setPctx(pCtx);
optm.initialize(conf);
pCtx = optm.optimize();
if (pCtx.getColumnAccessInfo() != null) {
// set ColumnAccessInfo for view column authorization
setColumnAccessInfo(pCtx.getColumnAccessInfo());
}
FetchTask origFetchTask = pCtx.getFetchTask();
if (LOG.isDebugEnabled()) {
LOG.debug("After logical optimization\n" + Operator.toString(pCtx.getTopOps().values()));
}
// 8. Generate column access stats if required - wait until column pruning
// takes place during optimization
boolean isColumnInfoNeedForAuth = SessionState.get().isAuthorizationModeV2() && HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_AUTHORIZATION_ENABLED);
if (isColumnInfoNeedForAuth || HiveConf.getBoolVar(this.conf, HiveConf.ConfVars.HIVE_STATS_COLLECT_SCANCOLS)) {
ColumnAccessAnalyzer columnAccessAnalyzer = new ColumnAccessAnalyzer(pCtx);
// view column access info is carried by this.getColumnAccessInfo().
setColumnAccessInfo(columnAccessAnalyzer.analyzeColumnAccess(this.getColumnAccessInfo()));
}
// 9. Optimize the physical op tree and translate to the target execution engine (MR, TEZ, etc.)
if (!ctx.getExplainLogical()) {
TaskCompiler compiler = TaskCompilerFactory.getCompiler(conf, pCtx);
compiler.init(queryState, console, db);
compiler.compile(pCtx, rootTasks, inputs, outputs);
fetchTask = pCtx.getFetchTask();
}
LOG.info("Completed plan generation");
// 10. put accessed columns to readEntity
if (HiveConf.getBoolVar(this.conf, HiveConf.ConfVars.HIVE_STATS_COLLECT_SCANCOLS)) {
putAccessedColumnsToReadEntity(inputs, columnAccessInfo);
}
// 11. If desired, check that we're not going over partition scan limits
if (!ctx.isExplainSkipExecution()) {
enforceScanLimits(pCtx, origFetchTask);
}
return;
}
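The lineage branch in step 5 above hinges on parsing hive.exec.post.hooks (HiveConf.ConfVars.POSTEXECHOOKS) into a set of hook class names. The following self-contained illustration of that Guava Splitter idiom uses a made-up class name and sample hook string:

import java.util.Set;

import com.google.common.base.Splitter;
import com.google.common.base.Strings;
import com.google.common.collect.Sets;

public class PostExecHookCheck {
  public static void main(String[] args) {
    // Simulated raw value of hive.exec.post.hooks; may be null or contain stray whitespace.
    String rawHooks = " org.apache.hadoop.hive.ql.hooks.LineageLogger, ,org.example.OtherHook ";

    // Same split-trim-drop-empties idiom as in analyzeInternal.
    Set<String> postExecHooks = Sets.newHashSet(
        Splitter.on(",").trimResults().omitEmptyStrings()
            .split(Strings.nullToEmpty(rawHooks)));

    // Lineage transformations are only added when one of these hooks is configured.
    boolean needsLineage =
        postExecHooks.contains("org.apache.hadoop.hive.ql.hooks.PostExecutePrinter")
            || postExecHooks.contains("org.apache.hadoop.hive.ql.hooks.LineageLogger")
            || postExecHooks.contains("org.apache.atlas.hive.hook.HiveHook");

    System.out.println(postExecHooks + " -> needsLineage=" + needsLineage);
  }
}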
Use of org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator in project hive by apache.
From the class SparkCompiler, method generateTaskTreeHelper.
private void generateTaskTreeHelper(GenSparkProcContext procCtx, List<Node> topNodes) throws SemanticException {
// create a walker which walks the tree in a DFS manner while maintaining
// the operator stack. The dispatcher generates the plan from the operator tree
Map<SemanticRule, SemanticNodeProcessor> opRules = new LinkedHashMap<SemanticRule, SemanticNodeProcessor>();
GenSparkWork genSparkWork = new GenSparkWork(GenSparkUtils.getUtils());
opRules.put(new RuleRegExp("Split Work - ReduceSink", ReduceSinkOperator.getOperatorName() + "%"), genSparkWork);
opRules.put(new RuleRegExp("Split Work - SparkPartitionPruningSink", SparkPartitionPruningSinkOperator.getOperatorName() + "%"), genSparkWork);
opRules.put(new TypeRule(MapJoinOperator.class), new SparkReduceSinkMapJoinProc());
opRules.put(new RuleRegExp("Split Work + Move/Merge - FileSink", FileSinkOperator.getOperatorName() + "%"), new CompositeProcessor(new SparkFileSinkProcessor(), genSparkWork));
opRules.put(new RuleRegExp("Handle Analyze Command", TableScanOperator.getOperatorName() + "%"), new SparkProcessAnalyzeTable(GenSparkUtils.getUtils()));
opRules.put(new RuleRegExp("Remember union", UnionOperator.getOperatorName() + "%"), new SemanticNodeProcessor() {
@Override
public Object process(Node n, Stack<Node> s, NodeProcessorCtx procCtx, Object... os) throws SemanticException {
GenSparkProcContext context = (GenSparkProcContext) procCtx;
UnionOperator union = (UnionOperator) n;
// simply need to remember that we've seen a union.
context.currentUnionOperators.add(union);
return null;
}
});
/**
 * SMB join case:  (Big)  (Small)  (Small)
 *                   TS     TS       TS
 *                    \     |       /
 *                     \    DS     DS
 *                      \   |     /
 *                       SMBJoinOP
 *
 * Some of the other processors are expecting only one traversal beyond SMBJoinOp.
 * We need to traverse from the big-table path only, and stop traversing on the
 * small-table path once we reach SMBJoinOp.
 * Also add some SMB join information to the context, so we can properly annotate
 * the MapWork later on.
 */
opRules.put(new TypeRule(SMBMapJoinOperator.class), new SemanticNodeProcessor() {
@Override
public Object process(Node currNode, Stack<Node> stack, NodeProcessorCtx procCtx, Object... os) throws SemanticException {
GenSparkProcContext context = (GenSparkProcContext) procCtx;
SMBMapJoinOperator currSmbNode = (SMBMapJoinOperator) currNode;
SparkSMBMapJoinInfo smbMapJoinCtx = context.smbMapJoinCtxMap.get(currSmbNode);
if (smbMapJoinCtx == null) {
smbMapJoinCtx = new SparkSMBMapJoinInfo();
context.smbMapJoinCtxMap.put(currSmbNode, smbMapJoinCtx);
}
for (Node stackNode : stack) {
if (stackNode instanceof DummyStoreOperator) {
// If coming from small-table side, do some book-keeping, and skip traversal.
smbMapJoinCtx.smallTableRootOps.add(context.currentRootOperator);
return true;
}
}
// If coming from big-table side, do some book-keeping, and continue traversal
smbMapJoinCtx.bigTableRootOp = context.currentRootOperator;
return false;
}
});
// The dispatcher fires the processor corresponding to the closest matching
// rule and passes the context along
SemanticDispatcher disp = new DefaultRuleDispatcher(null, opRules, procCtx);
SemanticGraphWalker ogw = new GenSparkWorkWalker(disp, procCtx);
ogw.startWalking(topNodes, null);
}
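generateTaskTreeHelper builds a rule map and hands it to a dispatcher-driven DFS walker that maintains the operator stack. The sketch below mimics that pattern with plain Java stand-ins so the control flow is visible in isolation; the class and method names are illustrative, not Hive's API, and this dispatcher simply fires the first matching rule (Hive's DefaultRuleDispatcher resolves the closest-matching rule).

import java.util.ArrayDeque;
import java.util.Deque;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.function.Predicate;

// Illustrative stand-in for the rule/dispatcher/walker pattern; not Hive's API.
class MiniRuleWalker {
  interface NodeProcessor {
    // Returning true means "stop descending below this node"
    // (compare the SMB small-table case above).
    boolean process(String node, Deque<String> stack);
  }

  private final Map<Predicate<String>, NodeProcessor> rules = new LinkedHashMap<>();
  private final Map<String, List<String>> children;

  MiniRuleWalker(Map<String, List<String>> children) {
    this.children = children;
  }

  void addRule(Predicate<String> rule, NodeProcessor proc) {
    rules.put(rule, proc);
  }

  // DFS while maintaining the node stack, dispatching each node to the first matching rule.
  void walk(String node, Deque<String> stack) {
    stack.push(node);
    boolean stopDescent = false;
    for (Map.Entry<Predicate<String>, NodeProcessor> e : rules.entrySet()) {
      if (e.getKey().test(node)) {
        stopDescent = e.getValue().process(node, stack);
        break;
      }
    }
    if (!stopDescent) {
      for (String child : children.getOrDefault(node, List.of())) {
        walk(child, stack);
      }
    }
    stack.pop();
  }

  public static void main(String[] args) {
    MiniRuleWalker walker = new MiniRuleWalker(Map.of(
        "TS_big", List.of("SMBJoin"),
        "SMBJoin", List.of("FS")));
    walker.addRule(n -> n.equals("SMBJoin"), (node, stack) -> {
      System.out.println("SMB rule fired at " + node + ", stack = " + stack);
      return false;
    });
    walker.addRule(n -> true, (node, stack) -> {
      System.out.println("default rule at " + node);
      return false;
    });
    walker.walk("TS_big", new ArrayDeque<String>());
  }
}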
Use of org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator in project hive by apache.
From the class GenSparkWork, method process.
@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procContext, Object... nodeOutputs) throws SemanticException {
GenSparkProcContext context = (GenSparkProcContext) procContext;
Preconditions.checkArgument(context != null, "AssertionError: expected context to be not null");
Preconditions.checkArgument(context.currentTask != null, "AssertionError: expected context.currentTask to be not null");
Preconditions.checkArgument(context.currentRootOperator != null, "AssertionError: expected context.currentRootOperator to be not null");
// Operator is a file sink or reduce sink. Something that forces a new vertex.
@SuppressWarnings("unchecked") Operator<? extends OperatorDesc> operator = (Operator<? extends OperatorDesc>) nd;
// root is the start of the operator pipeline we're currently
// packing into a vertex, typically a table scan, union or join
Operator<?> root = context.currentRootOperator;
LOG.debug("Root operator: " + root);
LOG.debug("Leaf operator: " + operator);
SparkWork sparkWork = context.currentTask.getWork();
SMBMapJoinOperator smbOp = GenSparkUtils.getChildOperator(root, SMBMapJoinOperator.class);
// Right now the work graph is pretty simple. If there is no
// preceding work we have a root and will generate a map
// vertex. If there is preceding work we will generate
// a reduce vertex.
BaseWork work;
if (context.rootToWorkMap.containsKey(root)) {
// Having seen the root operator before means there was a branch in the
// operator graph. There are typically two reasons for that: a) mux/demux,
// b) multi insert. Mux/demux will hit the same leaf again; multi insert
// will result in a vertex with multiple FS or RS operators.
// At this point we don't have to do anything special in this case. Just
// run through the regular paces w/o creating a new task.
work = context.rootToWorkMap.get(root);
} else {
// create a new vertex
if (context.preceedingWork == null) {
if (smbOp == null) {
work = utils.createMapWork(context, root, sparkWork, null);
} else {
// save work to be initialized later with SMB information.
work = utils.createMapWork(context, root, sparkWork, null, true);
context.smbMapJoinCtxMap.get(smbOp).mapWork = (MapWork) work;
}
} else {
work = utils.createReduceWork(context, root, sparkWork);
}
context.rootToWorkMap.put(root, work);
}
if (!context.childToWorkMap.containsKey(operator)) {
List<BaseWork> workItems = new LinkedList<BaseWork>();
workItems.add(work);
context.childToWorkMap.put(operator, workItems);
} else {
context.childToWorkMap.get(operator).add(work);
}
// remember which mapjoin operator links with which work
if (!context.currentMapJoinOperators.isEmpty()) {
for (MapJoinOperator mj : context.currentMapJoinOperators) {
LOG.debug("Processing map join: " + mj);
// remember the mapping in case we scan another branch of the mapjoin later
if (!context.mapJoinWorkMap.containsKey(mj)) {
List<BaseWork> workItems = new LinkedList<BaseWork>();
workItems.add(work);
context.mapJoinWorkMap.put(mj, workItems);
} else {
context.mapJoinWorkMap.get(mj).add(work);
}
/*
 * This happens in the case of map join operations.
 * The tree looks like this:
 *
 *      RS <--- we are here perhaps
 *      |
 *   MapJoin
 *   /     \
 *  RS      TS
 *  /
 * TS
 *
 * If we are at the RS pointed to above and have already visited the
 * RS following the TS, then work has already been generated for that
 * TS-RS branch, and we need to hook the current work to it.
 */
if (context.linkOpWithWorkMap.containsKey(mj)) {
Map<BaseWork, SparkEdgeProperty> linkWorkMap = context.linkOpWithWorkMap.get(mj);
if (linkWorkMap != null) {
if (context.linkChildOpWithDummyOp.containsKey(mj)) {
for (Operator<?> dummy : context.linkChildOpWithDummyOp.get(mj)) {
work.addDummyOp((HashTableDummyOperator) dummy);
}
}
for (Entry<BaseWork, SparkEdgeProperty> parentWorkMap : linkWorkMap.entrySet()) {
BaseWork parentWork = parentWorkMap.getKey();
LOG.debug("connecting " + parentWork.getName() + " with " + work.getName());
SparkEdgeProperty edgeProp = parentWorkMap.getValue();
sparkWork.connect(parentWork, work, edgeProp);
// now that we know the name of the downstream work, set it as the output name
// on the reduce sinks of the parent work
for (ReduceSinkOperator r : context.linkWorkWithReduceSinkMap.get(parentWork)) {
if (r.getConf().getOutputName() != null) {
LOG.debug("Cloning reduce sink for multi-child broadcast edge");
// we've already set this one up. Need to clone for the next work.
r = (ReduceSinkOperator) OperatorFactory.getAndMakeChild(r.getCompilationOpContext(), (ReduceSinkDesc) r.getConf().clone(), r.getParentOperators());
}
r.getConf().setOutputName(work.getName());
}
}
}
}
}
// clear out the set. we don't need it anymore.
context.currentMapJoinOperators.clear();
}
// Disconnect the root from its parents here, saving enough information so that
// the parent works can later be connected with the work associated with this root operator.
if (root.getNumParent() > 0) {
Preconditions.checkArgument(work instanceof ReduceWork, "AssertionError: expected work to be a ReduceWork, but was " + work.getClass().getName());
ReduceWork reduceWork = (ReduceWork) work;
for (Operator<?> parent : new ArrayList<Operator<?>>(root.getParentOperators())) {
Preconditions.checkArgument(parent instanceof ReduceSinkOperator, "AssertionError: expected operator to be a ReduceSinkOperator, but was " + parent.getClass().getName());
ReduceSinkOperator rsOp = (ReduceSinkOperator) parent;
SparkEdgeProperty edgeProp = GenSparkUtils.getEdgeProperty(context.conf, rsOp, reduceWork);
rsOp.getConf().setOutputName(reduceWork.getName());
GenMapRedUtils.setKeyAndValueDesc(reduceWork, rsOp);
context.leafOpToFollowingWorkInfo.put(rsOp, Pair.of(edgeProp, reduceWork));
LOG.debug("Removing " + parent + " as parent from " + root);
root.removeParent(parent);
}
}
// remember which works contain union operators, so that the union operators can be removed
// from the operator tree later.
if (!context.currentUnionOperators.isEmpty()) {
context.currentUnionOperators.clear();
context.workWithUnionOperators.add(work);
}
// Hook the leaf up to any downstream work. Note that the concept of root and leaf is
// reversed in Hive for historical reasons. Roots are data sources, leaves are data sinks. I know.
if (context.leafOpToFollowingWorkInfo.containsKey(operator)) {
Pair<SparkEdgeProperty, ReduceWork> childWorkInfo = context.leafOpToFollowingWorkInfo.get(operator);
SparkEdgeProperty edgeProp = childWorkInfo.getLeft();
ReduceWork childWork = childWorkInfo.getRight();
LOG.debug("Second pass. Leaf operator: " + operator + " has common downstream work:" + childWork);
// we don't want to connect them with the work associated with TS more than once.
if (sparkWork.getEdgeProperty(work, childWork) == null) {
sparkWork.connect(work, childWork, edgeProp);
} else {
LOG.debug("work " + work.getName() + " is already connected to " + childWork.getName() + " before");
}
} else {
LOG.debug("First pass. Leaf operator: " + operator);
}
// If this sink operator has children, the walk continues below it and the next item will be a new root.
if (!operator.getChildOperators().isEmpty()) {
Preconditions.checkArgument(operator.getChildOperators().size() == 1, "AssertionError: expected operator.getChildOperators().size() to be 1, but was " + operator.getChildOperators().size());
context.parentOfRoot = operator;
context.currentRootOperator = operator.getChildOperators().get(0);
context.preceedingWork = work;
}
return null;
}
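The second-pass branch near the end of process connects work to childWork only when no edge exists yet, so revisiting the same leaf from another branch cannot create duplicate edges. Below is a toy, self-contained version of that guard; the graph type and edge labels are made up for illustration.

import java.util.HashMap;
import java.util.Map;

// Minimal sketch of the "connect only if no edge exists yet" guard; not Hive's SparkWork.
class MiniWorkGraph {
  private final Map<String, String> edgeProperties = new HashMap<>();

  String getEdgeProperty(String from, String to) {
    return edgeProperties.get(from + "->" + to);
  }

  void connect(String from, String to, String edgeProp) {
    edgeProperties.put(from + "->" + to, edgeProp);
  }

  public static void main(String[] args) {
    MiniWorkGraph g = new MiniWorkGraph();
    // First visit of the leaf: no edge yet, so connect.
    if (g.getEdgeProperty("Map 1", "Reducer 2") == null) {
      g.connect("Map 1", "Reducer 2", "SHUFFLE");
    }
    // A later branch reaches the same pair: the edge exists, so skip reconnecting.
    if (g.getEdgeProperty("Map 1", "Reducer 2") == null) {
      g.connect("Map 1", "Reducer 2", "SHUFFLE");
    }
    System.out.println(g.edgeProperties); // {Map 1->Reducer 2=SHUFFLE}
  }
}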
Use of org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator in project hive by apache.
From the class AbstractSMBJoinProc, method convertJoinToSMBJoin.
// Convert the join operator to a sort-merge join operator
protected void convertJoinToSMBJoin(JoinOperator joinOp, SortBucketJoinProcCtx smbJoinContext) throws SemanticException {
MapJoinOperator mapJoinOp = convertJoinToBucketMapJoin(joinOp, smbJoinContext);
SMBMapJoinOperator smbMapJoinOp = convertBucketMapJoinToSMBJoin(mapJoinOp, smbJoinContext);
smbMapJoinOp.setConvertedAutomaticallySMBJoin(true);
}