use of org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator in project hive by apache.
the class SemanticAnalyzer method analyzeInternal.
/**
 * Runs semantic analysis on the given syntax tree and, unless one of the early
 * exits below is taken, compiles the resulting plan into tasks.
 *
 * <p>The method proceeds in numbered phases: resolve the parse tree, generate
 * the operator tree (re-generating it if the masking/filtering rewrite returns a
 * different tree), deduce the result schema, build a {@code ParseContext},
 * handle view creation, collect table/column access information, run the
 * logical optimizer and finally hand the plan to a {@code TaskCompiler}.
 *
 * <p>It returns early when the resolved parse tree cannot be generated, when
 * {@code checkResultsCache} reports a hit, or when only a (non-materialized)
 * view is being created.
 *
 * @param ast root of the syntax tree to analyze
 * @param pcf factory for the {@code PlannerContext}; a second context is created
 *            if the tree is rewritten for masking/filtering
 * @throws SemanticException if analysis fails; also wraps a {@code HiveException}
 *                           raised while adding inputs for a view
 */
void analyzeInternal(ASTNode ast, PlannerContextFactory pcf) throws SemanticException {
// 1. Generate Resolved Parse tree from syntax tree
LOG.info("Starting Semantic Analysis");
// Position aliases are processed here, before the resolved parse tree is generated.
processPositionAlias(ast);
PlannerContext plannerCtx = pcf.create();
if (!genResolvedParseTree(ast, plannerCtx)) {
// Nothing further to do when the resolved parse tree could not be generated.
return;
}
// When HIVE_REMOVE_ORDERBY_IN_SUBQUERY is set, apply removeOBInSubQuery to every subquery alias.
if (HiveConf.getBoolVar(conf, ConfVars.HIVE_REMOVE_ORDERBY_IN_SUBQUERY)) {
for (String alias : qb.getSubqAliases()) {
removeOBInSubQuery(qb.getSubqForAlias(alias));
}
}
// Check query results cache.
// If no masking/filtering required, then we can check the cache now, before
// generating the operator tree and going through CBO.
// Otherwise we have to wait until after the masking/filtering step.
boolean isCacheEnabled = conf.getBoolVar(HiveConf.ConfVars.HIVE_QUERY_RESULTS_CACHE_ENABLED);
QueryResultsCache.LookupInfo lookupInfo = null;
boolean needsTransform = needsTransform();
if (isCacheEnabled && !needsTransform && queryTypeCanUseCache()) {
lookupInfo = createLookupInfoForQuery(ast);
if (checkResultsCache(lookupInfo)) {
// Cache hit: skip plan generation entirely.
return;
}
}
// 2. Gen OP Tree from resolved Parse Tree
Operator sinkOp = genOPTree(ast, plannerCtx);
// The rewrite below is attempted only when the unparse translator is disabled,
// table masking is enabled and no analyzeRewrite is set.
if (!unparseTranslator.isEnabled() && (tableMask.isEnabled() && analyzeRewrite == null)) {
// Here we rewrite the * and also the masking table
ASTNode tree = rewriteASTWithMaskAndFilter(tableMask, ast, ctx.getTokenRewriteStream(), ctx, db, tabNameToTabObject, ignoredTokens);
// A different node instance means the tree was rewritten; redo the analysis on it
// with a fresh planner context and with table masking skipped.
if (tree != ast) {
plannerCtx = pcf.create();
ctx.setSkipTableMasking(true);
init(true);
// Position aliases must be processed again for the rewritten tree.
processPositionAlias(tree);
// NOTE(review): the boolean result is ignored here, unlike the first call above — confirm intended.
genResolvedParseTree(tree, plannerCtx);
if (this instanceof CalcitePlanner) {
((CalcitePlanner) this).resetCalciteConfiguration();
}
sinkOp = genOPTree(tree, plannerCtx);
}
}
// Check the query results cache here, after applying the masking/filtering
// rewrite rules to the AST.
// NOTE(review): the lookup is built from the original `ast`, not the rewritten `tree` — confirm intended.
if (isCacheEnabled && needsTransform && queryTypeCanUseCache()) {
lookupInfo = createLookupInfoForQuery(ast);
if (checkResultsCache(lookupInfo)) {
return;
}
}
// 3. Deduce Resultset Schema
if (createVwDesc != null && !this.ctx.isCboSucceeded()) {
resultSchema = convertRowSchemaToViewSchema(opParseCtx.get(sinkOp).getRowResolver());
} else {
// Derive the schema from the sink operator only if resultSchema has not
// already been set by an earlier step.
if (resultSchema == null) {
resultSchema = convertRowSchemaToResultSetSchema(opParseCtx.get(sinkOp).getRowResolver(), HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_RESULTSET_USE_UNIQUE_COLUMN_NAMES));
}
}
// 4. Generate Parse Context for Optimizer & Physical compiler
copyInfoToQueryProperties(queryProperties);
// The join / SMB join operator sets are copied into new HashSets rather than passed as key-set views.
ParseContext pCtx = new ParseContext(queryState, opToPartPruner, opToPartList, topOps, new HashSet<JoinOperator>(joinContext.keySet()), new HashSet<SMBMapJoinOperator>(smbMapJoinContext.keySet()), loadTableWork, loadFileWork, columnStatsAutoGatherContexts, ctx, idToTableNameMap, destTableId, uCtx, listMapJoinOpsNoReducer, prunedPartitions, tabNameToTabObject, opToSamplePruner, globalLimitCtx, nameToSplitSample, inputs, rootTasks, opToPartToSkewedPruner, viewAliasToInput, reduceSinkOperatorsAddedByEnforceBucketingSorting, analyzeRewrite, tableDesc, createVwDesc, materializedViewUpdateDesc, queryProperties, viewProjectToTableSchema, acidFileSinks);
// Set the semijoin hints in parse context
pCtx.setSemiJoinHints(parseSemiJoinHint(getQB().getParseInfo().getHintList()));
// Set the mapjoin hint if it needs to be disabled.
pCtx.setDisableMapJoin(disableMapJoinWithHint(getQB().getParseInfo().getHintList()));
// 5. Take care of view creation
if (createVwDesc != null) {
// Nothing more to do for a view while explain-analyze is in the RUNNING state.
if (ctx.getExplainAnalyze() == AnalyzeState.RUNNING) {
return;
}
if (!ctx.isCboSucceeded()) {
saveViewDefinition();
}
// validate the create view statement at this point, the createVwDesc gets
// all the information for semanticcheck
validateCreateView();
if (createVwDesc.isMaterialized()) {
// Materialized views record the tables they use and then continue with the remaining phases.
createVwDesc.setTablesUsed(getTablesUsed(pCtx));
} else {
// Since we're only creating a view (not executing it), we don't need to
// optimize or translate the plan (and in fact, those procedures can
// interfere with the view creation). So skip the rest of this method.
ctx.setResDir(null);
ctx.setResFile(null);
try {
PlanUtils.addInputsForView(pCtx);
} catch (HiveException e) {
throw new SemanticException(e);
}
// Generate lineage info for create view statements
// if LineageLogger hook is configured.
// Add the transformation that computes the lineage information.
// POSTEXECHOOKS is read as a comma-separated list; entries are trimmed and empty ones dropped.
Set<String> postExecHooks = Sets.newHashSet(Splitter.on(",").trimResults().omitEmptyStrings().split(Strings.nullToEmpty(HiveConf.getVar(conf, HiveConf.ConfVars.POSTEXECHOOKS))));
if (postExecHooks.contains("org.apache.hadoop.hive.ql.hooks.PostExecutePrinter") || postExecHooks.contains("org.apache.hadoop.hive.ql.hooks.LineageLogger") || postExecHooks.contains("org.apache.atlas.hive.hook.HiveHook")) {
ArrayList<Transform> transformations = new ArrayList<Transform>();
transformations.add(new HiveOpConverterPostProc());
transformations.add(new Generator(postExecHooks));
// Each transform receives the context produced by the previous one.
for (Transform t : transformations) {
pCtx = t.transform(pCtx);
}
// we just use view name as location.
queryState.getLineageState().mapDirToOp(new Path(createVwDesc.getViewName()), sinkOp);
}
return;
}
}
// 6. Generate table access stats if required
if (HiveConf.getBoolVar(this.conf, HiveConf.ConfVars.HIVE_STATS_COLLECT_TABLEKEYS)) {
TableAccessAnalyzer tableAccessAnalyzer = new TableAccessAnalyzer(pCtx);
setTableAccessInfo(tableAccessAnalyzer.analyzeTableAccess());
}
// 7. Perform Logical optimization
if (LOG.isDebugEnabled()) {
LOG.debug("Before logical optimization\n" + Operator.toString(pCtx.getTopOps().values()));
}
Optimizer optm = new Optimizer();
optm.setPctx(pCtx);
optm.initialize(conf);
// From here on pCtx refers to the context returned by the optimizer.
pCtx = optm.optimize();
if (pCtx.getColumnAccessInfo() != null) {
// set ColumnAccessInfo for view column authorization
setColumnAccessInfo(pCtx.getColumnAccessInfo());
}
if (LOG.isDebugEnabled()) {
LOG.debug("After logical optimization\n" + Operator.toString(pCtx.getTopOps().values()));
}
// 8. Generate column access stats if required - wait until column pruning
// takes place during optimization
boolean isColumnInfoNeedForAuth = SessionState.get().isAuthorizationModeV2() && HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_AUTHORIZATION_ENABLED);
if (isColumnInfoNeedForAuth || HiveConf.getBoolVar(this.conf, HiveConf.ConfVars.HIVE_STATS_COLLECT_SCANCOLS)) {
ColumnAccessAnalyzer columnAccessAnalyzer = new ColumnAccessAnalyzer(pCtx);
// view column access info is carried by this.getColumnAccessInfo().
setColumnAccessInfo(columnAccessAnalyzer.analyzeColumnAccess(this.getColumnAccessInfo()));
}
// 9. Compile the optimized plan into tasks with the TaskCompiler chosen by
// TaskCompilerFactory; skipped when ctx.getExplainLogical() is set.
if (!ctx.getExplainLogical()) {
TaskCompiler compiler = TaskCompilerFactory.getCompiler(conf, pCtx);
compiler.init(queryState, console, db);
compiler.compile(pCtx, rootTasks, inputs, outputs);
fetchTask = pCtx.getFetchTask();
}
// find all Acid FileSinkOperatorS
// NOTE(review): `qp` is never read afterwards; presumably the constructor does the work — confirm.
QueryPlanPostProcessor qp = new QueryPlanPostProcessor(rootTasks, acidFileSinks, ctx.getExecutionId());
LOG.info("Completed plan generation");
// 10. put accessed columns to readEntity
if (HiveConf.getBoolVar(this.conf, HiveConf.ConfVars.HIVE_STATS_COLLECT_SCANCOLS)) {
putAccessedColumnsToReadEntity(inputs, columnAccessInfo);
}
// lookupInfo is non-null only if one of the cache lookups above was attempted (and missed).
if (isCacheEnabled && lookupInfo != null) {
if (queryCanBeCached()) {
QueryResultsCache.QueryInfo queryInfo = createCacheQueryInfoForQuery(lookupInfo);
// Specify that the results of this query can be cached.
setCacheUsage(new CacheUsage(CacheUsage.CacheStatus.CAN_CACHE_QUERY_RESULTS, queryInfo));
}
}
}
use of org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator in project hive by apache.
the class GenMapRedUtils method splitTasks.
/**
 * Split two tasks by creating a temporary file between them. The table scan
 * created over that file is registered with the child task's map work.
 *
 * @param op reduce sink operator being processed; must have exactly one parent
 * @param parentTask the parent task
 * @param childTask the child task; removed from the root tasks if present
 * @param opProcCtx context
 * @throws IllegalStateException if {@code op} does not have exactly one parent
 */
@SuppressWarnings("nls")
private static void splitTasks(ReduceSinkOperator op, Task<? extends Serializable> parentTask, Task<? extends Serializable> childTask, GenMRProcContext opProcCtx) throws SemanticException {
    if (op.getNumParent() != 1) {
        throw new IllegalStateException("Expecting operator " + op + " to have one parent. " + "But found multiple parents : " + op.getParentOperators());
    }

    ParseContext pctx = opProcCtx.getParseCtx();
    parentTask.addDependentTask(childTask);

    // The child now depends on the parent, so it can no longer be a root task.
    List<Task<? extends Serializable>> roots = opProcCtx.getRootTasks();
    if (roots.contains(childTask)) {
        roots.remove(childTask);
    }

    // Temporary location, plus a table descriptor derived from the parent's row schema.
    Path tmpDir = pctx.getContext().getMRTmpPath();
    Operator<? extends OperatorDesc> upstream = op.getParentOperators().get(0);
    TableDesc tmpTableDesc = PlanUtils.getIntermediateFileTableDesc(
        PlanUtils.getFieldSchemasFromRowSchema(upstream.getSchema(), "temporarycol"));

    // Creates the temporary file together with its file sink and table scan operators.
    TableScanOperator tmpScan = createTemporaryFile(upstream, op, tmpDir, tmpTableDesc, pctx);
    opProcCtx.getMapCurrCtx().put(tmpScan, new GenMapRedCtx(childTask, null));

    // Default alias is the temporary directory URI; replaced below when tagging is needed.
    String alias = tmpDir.toUri().toString();
    MapredWork childPlan = (MapredWork) childTask.getWork();

    if (needsTagging(childPlan.getReduceWork())) {
        Operator<? extends OperatorDesc> reducer = childPlan.getReduceWork().getReducer();

        // Pick up the join id only when the reducer is a join known to the parse context.
        String joinId = null;
        if (reducer instanceof JoinOperator) {
            joinId = pctx.getJoinOps().contains(reducer)
                ? ((JoinOperator) reducer).getConf().getId() : null;
        } else if (reducer instanceof MapJoinOperator) {
            joinId = pctx.getMapJoinOps().contains(reducer)
                ? ((MapJoinOperator) reducer).getConf().getId() : null;
        } else if (reducer instanceof SMBMapJoinOperator) {
            joinId = pctx.getSmbMapJoinOps().contains(reducer)
                ? ((SMBMapJoinOperator) reducer).getConf().getId() : null;
        }

        String baseAlias = (joinId != null) ? joinId + ":$INTNAME" : "$INTNAME";
        alias = baseAlias;

        // Append 1, 2, 3, ... until the alias is not yet used in the child's map work.
        for (int suffix = 1; childPlan.getMapWork().getAliasToWork().get(alias) != null; suffix++) {
            alias = baseAlias.concat(String.valueOf(suffix));
        }

        // TODO: Allocate work to remove the temporary files and make that
        // dependent on the redTask
        childPlan.getReduceWork().setNeedsTagging(true);
    }

    // Add the path to alias mapping
    setTaskPlan(tmpDir, alias, tmpScan, childPlan.getMapWork(), false, tmpTableDesc);

    opProcCtx.setCurrTopOp(null);
    opProcCtx.setCurrAliasId(null);
    opProcCtx.setCurrTask(childTask);
    opProcCtx.addRootIfPossible(parentTask);
}
use of org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator in project hive by apache.
the class GenMapRedUtils method splitTasks.
/**
 * Split two tasks by creating a temporary file between them. The table scan
 * created over that file is registered with the child task's map work.
 *
 * @param op reduce sink operator being processed; must have exactly one parent
 * @param parentTask the parent task
 * @param childTask the child task; removed from the root tasks if present
 * @param opProcCtx context
 * @throws IllegalStateException if {@code op} does not have exactly one parent
 */
@SuppressWarnings("nls")
private static void splitTasks(ReduceSinkOperator op, Task<?> parentTask, Task<?> childTask, GenMRProcContext opProcCtx) throws SemanticException {
    if (op.getNumParent() != 1) {
        throw new IllegalStateException("Expecting operator " + op + " to have one parent. " + "But found multiple parents : " + op.getParentOperators());
    }

    ParseContext pctx = opProcCtx.getParseCtx();
    parentTask.addDependentTask(childTask);

    // The child now depends on the parent, so it can no longer be a root task.
    List<Task<?>> roots = opProcCtx.getRootTasks();
    if (roots.contains(childTask)) {
        roots.remove(childTask);
    }

    // Temporary location, plus a table descriptor derived from the parent's row schema.
    Path tmpDir = pctx.getContext().getMRTmpPath();
    Operator<? extends OperatorDesc> upstream = op.getParentOperators().get(0);
    TableDesc tmpTableDesc = PlanUtils.getIntermediateFileTableDesc(
        PlanUtils.getFieldSchemasFromRowSchema(upstream.getSchema(), "temporarycol"));

    // Creates the temporary file together with its file sink and table scan operators.
    TableScanOperator tmpScan = createTemporaryFile(upstream, op, tmpDir, tmpTableDesc, pctx);
    opProcCtx.getMapCurrCtx().put(tmpScan, new GenMapRedCtx(childTask, null));

    // Default alias is the temporary directory URI; replaced below when tagging is needed.
    String alias = tmpDir.toUri().toString();
    MapredWork childPlan = (MapredWork) childTask.getWork();

    if (needsTagging(childPlan.getReduceWork())) {
        Operator<? extends OperatorDesc> reducer = childPlan.getReduceWork().getReducer();

        // Pick up the join id only when the reducer is a join known to the parse context.
        String joinId = null;
        if (reducer instanceof JoinOperator) {
            joinId = pctx.getJoinOps().contains(reducer)
                ? ((JoinOperator) reducer).getConf().getId() : null;
        } else if (reducer instanceof MapJoinOperator) {
            joinId = pctx.getMapJoinOps().contains(reducer)
                ? ((MapJoinOperator) reducer).getConf().getId() : null;
        } else if (reducer instanceof SMBMapJoinOperator) {
            joinId = pctx.getSmbMapJoinOps().contains(reducer)
                ? ((SMBMapJoinOperator) reducer).getConf().getId() : null;
        }

        String baseAlias = (joinId != null) ? joinId + ":$INTNAME" : "$INTNAME";
        alias = baseAlias;

        // Append 1, 2, 3, ... until the alias is not yet used in the child's map work.
        for (int suffix = 1; childPlan.getMapWork().getAliasToWork().get(alias) != null; suffix++) {
            alias = baseAlias.concat(String.valueOf(suffix));
        }

        // TODO: Allocate work to remove the temporary files and make that
        // dependent on the redTask
        childPlan.getReduceWork().setNeedsTagging(true);
    }

    // Add the path to alias mapping
    setTaskPlan(tmpDir, alias, tmpScan, childPlan.getMapWork(), false, tmpTableDesc);

    opProcCtx.setCurrTopOp(null);
    opProcCtx.setCurrAliasId(null);
    opProcCtx.setCurrTask(childTask);
    opProcCtx.addRootIfPossible(parentTask);
}
use of org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator in project hive by apache.
the class MapJoinProcessor method convertSMBJoinToMapJoin.
/**
 * Converts a sort-merge join into a map-side join by building a new map join
 * operator from the SMB join's descriptor and splicing it into the operator
 * graph in place of the SMB join. On return {@code smbJoinOp} has its parent
 * and child operator lists set to {@code null}.
 *
 * @param hconf
 *          configuration passed on when building the map join key table descriptor
 * @param smbJoinOp
 *          sort-merge join operator to replace
 * @param bigTablePos
 *          position of the source to be read as part of map-reduce framework. All other sources
 *          are cached in memory
 * @param noCheckOuterJoin
 *          not read by this method
 * @return the map join operator that took the place of {@code smbJoinOp}
 */
public static MapJoinOperator convertSMBJoinToMapJoin(HiveConf hconf, SMBMapJoinOperator smbJoinOp, int bigTablePos, boolean noCheckOuterJoin) throws SemanticException {
    SMBJoinDesc srcDesc = smbJoinOp.getConf();

    // The key table descriptor is derived from the key columns stored under tag 0.
    List<ExprNodeDesc> tagZeroKeys = srcDesc.getKeys().get(Byte.valueOf((byte) 0));
    TableDesc keyTblDesc = PlanUtils.getMapJoinKeyTableDesc(hconf,
        PlanUtils.getFieldSchemasFromColumnList(tagZeroKeys, MAPJOINKEY_FIELDPREFIX));

    // Build the map join descriptor from the SMB join descriptor's properties.
    MapJoinDesc mjDesc = new MapJoinDesc(
        srcDesc.getKeys(),
        keyTblDesc,
        srcDesc.getExprs(),
        srcDesc.getValueTblDescs(),
        srcDesc.getValueTblDescs(),
        srcDesc.getOutputColumnNames(),
        bigTablePos,
        srcDesc.getConds(),
        srcDesc.getFilters(),
        srcDesc.isNoOuterJoin(),
        srcDesc.getDumpFilePrefix(),
        srcDesc.getMemoryMonitorInfo(),
        srcDesc.getInMemoryDataSize());
    mjDesc.setStatistics(srcDesc.getStatistics());
    mjDesc.setColumnExprMap(srcDesc.getColumnExprMap());

    // The new operator reuses the SMB join's row schema and starts with no parents.
    RowSchema smbSchema = smbJoinOp.getSchema();
    MapJoinOperator replacement = (MapJoinOperator) OperatorFactory.getAndMakeChild(
        smbJoinOp.getCompilationOpContext(), mjDesc, smbSchema,
        new ArrayList<Operator<? extends OperatorDesc>>());

    // Downstream operators: swap their parent, then hand the child list over.
    List<Operator<? extends OperatorDesc>> downstream = smbJoinOp.getChildOperators();
    for (Operator<? extends OperatorDesc> consumer : downstream) {
        consumer.replaceParent(smbJoinOp, replacement);
    }
    replacement.setChildOperators(downstream);
    smbJoinOp.setChildOperators(null);

    // Upstream operators: swap their child, then hand the parent list over.
    List<Operator<? extends OperatorDesc>> upstream = smbJoinOp.getParentOperators();
    for (Operator<? extends OperatorDesc> producer : upstream) {
        producer.replaceChild(smbJoinOp, replacement);
    }
    replacement.setParentOperators(upstream);
    smbJoinOp.setParentOperators(null);

    return replacement;
}
use of org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator in project hive by apache.
the class AbstractSMBJoinProc method convertBucketMapJoinToSMBJoin.
/**
 * Convert the bucket map-join operator to a sort-merge map join operator.
 *
 * The new operator takes the map join's place in the operator graph. Every
 * parent other than the big table gets a dummy store operator inserted between
 * it and the new join, and the parse context's join bookkeeping is updated.
 *
 * @param mapJoinOp map join operator to replace
 * @param smbJoinContext supplies the source aliases, indexed by parent position
 * @return the sort-merge map join operator that replaced {@code mapJoinOp}
 */
protected SMBMapJoinOperator convertBucketMapJoinToSMBJoin(MapJoinOperator mapJoinOp, SortBucketJoinProcCtx smbJoinContext) {
    String[] sourceAliases = smbJoinContext.getSrcs();

    SMBMapJoinOperator smbOp = new SMBMapJoinOperator(mapJoinOp);
    SMBJoinDesc smbDesc = new SMBJoinDesc(mapJoinOp.getConf());
    smbOp.setConf(smbDesc);

    // Tag i maps to the i-th source alias.
    HashMap<Byte, String> aliasByTag = new HashMap<Byte, String>();
    for (int tag = 0; tag < sourceAliases.length; tag++) {
        aliasByTag.put((byte) tag, sourceAliases[tag]);
    }
    smbDesc.setTagToAlias(aliasByTag);

    // If the map join is listed among the no-reducer map joins, replace it at the same position.
    int noReducerPos = this.pGraphContext.getListMapJoinOpsNoReducer().indexOf(mapJoinOp);
    if (noReducerPos >= 0) {
        this.pGraphContext.getListMapJoinOpsNoReducer().remove(noReducerPos);
        this.pGraphContext.getListMapJoinOpsNoReducer().add(noReducerPos, smbOp);
    }

    /*
     * For all parents (other than the big table), insert a dummy store operator.
     * Consider a query like:
     *
     *   select * from (subq1 --> has a filter) join (subq2 --> has a filter) on some key
     *
     * Let us assume that subq1 is the small table (either specified by the user or
     * inferred automatically). The following operator tree will be created:
     *
     *   TableScan (subq1) --> Select --> Filter --> DummyStore --> SMBJoin
     *   TableScan (subq2) --> Select --> Filter ----------------> SMBJoin
     */
    Map<String, DummyStoreOperator> sinkByAlias = new HashMap<String, DummyStoreOperator>();
    List<Operator<? extends OperatorDesc>> joinParents = mapJoinOp.getParentOperators();
    for (int pos = 0; pos < joinParents.size(); pos++) {
        Operator<? extends OperatorDesc> parent = joinParents.get(pos);
        List<Operator<? extends OperatorDesc>> siblings = parent.getChildOperators();
        int slot = siblings.indexOf(mapJoinOp);
        siblings.remove(slot);

        // The big table feeds the SMB join directly.
        if (pos == smbDesc.getPosBigTable()) {
            siblings.add(slot, smbOp);
            continue;
        }

        // Other tables go through a dummy store placed in the map join's old slot.
        DummyStoreOperator dummyStore = (DummyStoreOperator) OperatorFactory.get(
            parent.getCompilationOpContext(), new DummyStoreDesc());
        siblings.add(slot, dummyStore);

        List<Operator<? extends OperatorDesc>> storeChildren = new ArrayList<Operator<? extends OperatorDesc>>();
        storeChildren.add(smbOp);
        dummyStore.setChildOperators(storeChildren);

        List<Operator<? extends OperatorDesc>> storeParents = new ArrayList<Operator<? extends OperatorDesc>>();
        storeParents.add(parent);
        dummyStore.setParentOperators(storeParents);

        sinkByAlias.put(sourceAliases[pos], dummyStore);

        // The SMB join's parent at this position becomes the dummy store.
        smbOp.getParentOperators().remove(pos);
        smbOp.getParentOperators().add(pos, dummyStore);
    }
    smbDesc.setAliasToSink(sinkByAlias);

    // Point every child of the map join at the SMB join, keeping its parent position.
    List<Operator<? extends OperatorDesc>> joinChildren = mapJoinOp.getChildOperators();
    for (Operator<? extends OperatorDesc> child : joinChildren) {
        int slot = child.getParentOperators().indexOf(mapJoinOp);
        child.getParentOperators().remove(slot);
        child.getParentOperators().add(slot, smbOp);
    }

    // Data structures coming from QBJoinTree
    smbOp.getConf().setQBJoinTreeProps(mapJoinOp.getConf());

    // Register the SMB join with the parse context and drop the map join it replaces.
    pGraphContext.getSmbMapJoinOps().add(smbOp);
    pGraphContext.getMapJoinOps().remove(mapJoinOp);
    return smbOp;
}
Aggregations