Search in sources :

Example 1 with SMBMapJoinOperator

use of org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator in project hive by apache.

the class SemanticAnalyzer method analyzeInternal.

void analyzeInternal(ASTNode ast, PlannerContextFactory pcf) throws SemanticException {
    // 1. Generate Resolved Parse tree from syntax tree
    LOG.info("Starting Semantic Analysis");
    // change the location of position alias process here
    processPositionAlias(ast);
    PlannerContext plannerCtx = pcf.create();
    if (!genResolvedParseTree(ast, plannerCtx)) {
        return;
    }
    if (HiveConf.getBoolVar(conf, ConfVars.HIVE_REMOVE_ORDERBY_IN_SUBQUERY)) {
        for (String alias : qb.getSubqAliases()) {
            removeOBInSubQuery(qb.getSubqForAlias(alias));
        }
    }
    // Check query results cache.
    // If no masking/filtering required, then we can check the cache now, before
    // generating the operator tree and going through CBO.
    // Otherwise we have to wait until after the masking/filtering step.
    boolean isCacheEnabled = conf.getBoolVar(HiveConf.ConfVars.HIVE_QUERY_RESULTS_CACHE_ENABLED);
    QueryResultsCache.LookupInfo lookupInfo = null;
    boolean needsTransform = needsTransform();
    if (isCacheEnabled && !needsTransform && queryTypeCanUseCache()) {
        lookupInfo = createLookupInfoForQuery(ast);
        if (checkResultsCache(lookupInfo)) {
            return;
        }
    }
    // 2. Gen OP Tree from resolved Parse Tree
    Operator sinkOp = genOPTree(ast, plannerCtx);
    if (!unparseTranslator.isEnabled() && (tableMask.isEnabled() && analyzeRewrite == null)) {
        // Here we rewrite the * and also the masking table
        ASTNode tree = rewriteASTWithMaskAndFilter(tableMask, ast, ctx.getTokenRewriteStream(), ctx, db, tabNameToTabObject, ignoredTokens);
        if (tree != ast) {
            plannerCtx = pcf.create();
            ctx.setSkipTableMasking(true);
            init(true);
            // change the location of position alias process here
            processPositionAlias(tree);
            genResolvedParseTree(tree, plannerCtx);
            if (this instanceof CalcitePlanner) {
                ((CalcitePlanner) this).resetCalciteConfiguration();
            }
            sinkOp = genOPTree(tree, plannerCtx);
        }
    }
    // here, after applying the masking/filtering rewrite rules to the AST.
    if (isCacheEnabled && needsTransform && queryTypeCanUseCache()) {
        lookupInfo = createLookupInfoForQuery(ast);
        if (checkResultsCache(lookupInfo)) {
            return;
        }
    }
    // 3. Deduce Resultset Schema
    if (createVwDesc != null && !this.ctx.isCboSucceeded()) {
        resultSchema = convertRowSchemaToViewSchema(opParseCtx.get(sinkOp).getRowResolver());
    } else {
        // succeeds.
        if (resultSchema == null) {
            resultSchema = convertRowSchemaToResultSetSchema(opParseCtx.get(sinkOp).getRowResolver(), HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_RESULTSET_USE_UNIQUE_COLUMN_NAMES));
        }
    }
    // 4. Generate Parse Context for Optimizer & Physical compiler
    copyInfoToQueryProperties(queryProperties);
    ParseContext pCtx = new ParseContext(queryState, opToPartPruner, opToPartList, topOps, new HashSet<JoinOperator>(joinContext.keySet()), new HashSet<SMBMapJoinOperator>(smbMapJoinContext.keySet()), loadTableWork, loadFileWork, columnStatsAutoGatherContexts, ctx, idToTableNameMap, destTableId, uCtx, listMapJoinOpsNoReducer, prunedPartitions, tabNameToTabObject, opToSamplePruner, globalLimitCtx, nameToSplitSample, inputs, rootTasks, opToPartToSkewedPruner, viewAliasToInput, reduceSinkOperatorsAddedByEnforceBucketingSorting, analyzeRewrite, tableDesc, createVwDesc, materializedViewUpdateDesc, queryProperties, viewProjectToTableSchema, acidFileSinks);
    // Set the semijoin hints in parse context
    pCtx.setSemiJoinHints(parseSemiJoinHint(getQB().getParseInfo().getHintList()));
    // Set the mapjoin hint if it needs to be disabled.
    pCtx.setDisableMapJoin(disableMapJoinWithHint(getQB().getParseInfo().getHintList()));
    // 5. Take care of view creation
    if (createVwDesc != null) {
        if (ctx.getExplainAnalyze() == AnalyzeState.RUNNING) {
            return;
        }
        if (!ctx.isCboSucceeded()) {
            saveViewDefinition();
        }
        // validate the create view statement at this point, the createVwDesc gets
        // all the information for semanticcheck
        validateCreateView();
        if (createVwDesc.isMaterialized()) {
            createVwDesc.setTablesUsed(getTablesUsed(pCtx));
        } else {
            // Since we're only creating a view (not executing it), we don't need to
            // optimize or translate the plan (and in fact, those procedures can
            // interfere with the view creation). So skip the rest of this method.
            ctx.setResDir(null);
            ctx.setResFile(null);
            try {
                PlanUtils.addInputsForView(pCtx);
            } catch (HiveException e) {
                throw new SemanticException(e);
            }
            // Generate lineage info for create view statements
            // if LineageLogger hook is configured.
            // Add the transformation that computes the lineage information.
            Set<String> postExecHooks = Sets.newHashSet(Splitter.on(",").trimResults().omitEmptyStrings().split(Strings.nullToEmpty(HiveConf.getVar(conf, HiveConf.ConfVars.POSTEXECHOOKS))));
            if (postExecHooks.contains("org.apache.hadoop.hive.ql.hooks.PostExecutePrinter") || postExecHooks.contains("org.apache.hadoop.hive.ql.hooks.LineageLogger") || postExecHooks.contains("org.apache.atlas.hive.hook.HiveHook")) {
                ArrayList<Transform> transformations = new ArrayList<Transform>();
                transformations.add(new HiveOpConverterPostProc());
                transformations.add(new Generator(postExecHooks));
                for (Transform t : transformations) {
                    pCtx = t.transform(pCtx);
                }
                // we just use view name as location.
                queryState.getLineageState().mapDirToOp(new Path(createVwDesc.getViewName()), sinkOp);
            }
            return;
        }
    }
    // 6. Generate table access stats if required
    if (HiveConf.getBoolVar(this.conf, HiveConf.ConfVars.HIVE_STATS_COLLECT_TABLEKEYS)) {
        TableAccessAnalyzer tableAccessAnalyzer = new TableAccessAnalyzer(pCtx);
        setTableAccessInfo(tableAccessAnalyzer.analyzeTableAccess());
    }
    // 7. Perform Logical optimization
    if (LOG.isDebugEnabled()) {
        LOG.debug("Before logical optimization\n" + Operator.toString(pCtx.getTopOps().values()));
    }
    Optimizer optm = new Optimizer();
    optm.setPctx(pCtx);
    optm.initialize(conf);
    pCtx = optm.optimize();
    if (pCtx.getColumnAccessInfo() != null) {
        // set ColumnAccessInfo for view column authorization
        setColumnAccessInfo(pCtx.getColumnAccessInfo());
    }
    if (LOG.isDebugEnabled()) {
        LOG.debug("After logical optimization\n" + Operator.toString(pCtx.getTopOps().values()));
    }
    // 8. Generate column access stats if required - wait until column pruning
    // takes place during optimization
    boolean isColumnInfoNeedForAuth = SessionState.get().isAuthorizationModeV2() && HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_AUTHORIZATION_ENABLED);
    if (isColumnInfoNeedForAuth || HiveConf.getBoolVar(this.conf, HiveConf.ConfVars.HIVE_STATS_COLLECT_SCANCOLS)) {
        ColumnAccessAnalyzer columnAccessAnalyzer = new ColumnAccessAnalyzer(pCtx);
        // view column access info is carried by this.getColumnAccessInfo().
        setColumnAccessInfo(columnAccessAnalyzer.analyzeColumnAccess(this.getColumnAccessInfo()));
    }
    // TEZ..)
    if (!ctx.getExplainLogical()) {
        TaskCompiler compiler = TaskCompilerFactory.getCompiler(conf, pCtx);
        compiler.init(queryState, console, db);
        compiler.compile(pCtx, rootTasks, inputs, outputs);
        fetchTask = pCtx.getFetchTask();
    }
    // find all Acid FileSinkOperatorS
    QueryPlanPostProcessor qp = new QueryPlanPostProcessor(rootTasks, acidFileSinks, ctx.getExecutionId());
    LOG.info("Completed plan generation");
    // 10. put accessed columns to readEntity
    if (HiveConf.getBoolVar(this.conf, HiveConf.ConfVars.HIVE_STATS_COLLECT_SCANCOLS)) {
        putAccessedColumnsToReadEntity(inputs, columnAccessInfo);
    }
    if (isCacheEnabled && lookupInfo != null) {
        if (queryCanBeCached()) {
            QueryResultsCache.QueryInfo queryInfo = createCacheQueryInfoForQuery(lookupInfo);
            // Specify that the results of this query can be cached.
            setCacheUsage(new CacheUsage(CacheUsage.CacheStatus.CAN_CACHE_QUERY_RESULTS, queryInfo));
        }
    }
}
Also used : AbstractMapJoinOperator(org.apache.hadoop.hive.ql.exec.AbstractMapJoinOperator) SelectOperator(org.apache.hadoop.hive.ql.exec.SelectOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) GroupByOperator(org.apache.hadoop.hive.ql.exec.GroupByOperator) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) FilterOperator(org.apache.hadoop.hive.ql.exec.FilterOperator) LimitOperator(org.apache.hadoop.hive.ql.exec.LimitOperator) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) UnionOperator(org.apache.hadoop.hive.ql.exec.UnionOperator) SMBMapJoinOperator(org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator) AbstractMapJoinOperator(org.apache.hadoop.hive.ql.exec.AbstractMapJoinOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) SMBMapJoinOperator(org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator) QueryPlanPostProcessor(org.apache.hadoop.hive.ql.optimizer.QueryPlanPostProcessor) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) Optimizer(org.apache.hadoop.hive.ql.optimizer.Optimizer) ArrayList(java.util.ArrayList) QueryResultsCache(org.apache.hadoop.hive.ql.cache.results.QueryResultsCache) SMBMapJoinOperator(org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator) HiveOpConverterPostProc(org.apache.hadoop.hive.ql.optimizer.calcite.translator.HiveOpConverterPostProc) CalciteSemanticException(org.apache.hadoop.hive.ql.optimizer.calcite.CalciteSemanticException) CacheUsage(org.apache.hadoop.hive.ql.cache.results.CacheUsage) Path(org.apache.hadoop.fs.Path) Transform(org.apache.hadoop.hive.ql.optimizer.Transform) Generator(org.apache.hadoop.hive.ql.optimizer.lineage.Generator)

Example 2 with SMBMapJoinOperator

use of org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator in project hive by apache.

the class GenMapRedUtils method splitTasks.

@SuppressWarnings("nls")
private static /**
 * Split two tasks by creating a temporary file between them.
 *
 * @param op reduce sink operator being processed
 * @param parentTask the parent task
 * @param childTask the child task
 * @param opProcCtx context
 */
void splitTasks(ReduceSinkOperator op, Task<? extends Serializable> parentTask, Task<? extends Serializable> childTask, GenMRProcContext opProcCtx) throws SemanticException {
    if (op.getNumParent() != 1) {
        throw new IllegalStateException("Expecting operator " + op + " to have one parent. " + "But found multiple parents : " + op.getParentOperators());
    }
    ParseContext parseCtx = opProcCtx.getParseCtx();
    parentTask.addDependentTask(childTask);
    // Root Task cannot depend on any other task, therefore childTask cannot be
    // a root Task
    List<Task<? extends Serializable>> rootTasks = opProcCtx.getRootTasks();
    if (rootTasks.contains(childTask)) {
        rootTasks.remove(childTask);
    }
    // Generate the temporary file name
    Context baseCtx = parseCtx.getContext();
    Path taskTmpDir = baseCtx.getMRTmpPath();
    Operator<? extends OperatorDesc> parent = op.getParentOperators().get(0);
    TableDesc tt_desc = PlanUtils.getIntermediateFileTableDesc(PlanUtils.getFieldSchemasFromRowSchema(parent.getSchema(), "temporarycol"));
    // Create the temporary file, its corresponding FileSinkOperaotr, and
    // its corresponding TableScanOperator.
    TableScanOperator tableScanOp = createTemporaryFile(parent, op, taskTmpDir, tt_desc, parseCtx);
    Map<Operator<? extends OperatorDesc>, GenMapRedCtx> mapCurrCtx = opProcCtx.getMapCurrCtx();
    mapCurrCtx.put(tableScanOp, new GenMapRedCtx(childTask, null));
    String streamDesc = taskTmpDir.toUri().toString();
    MapredWork cplan = (MapredWork) childTask.getWork();
    if (needsTagging(cplan.getReduceWork())) {
        Operator<? extends OperatorDesc> reducerOp = cplan.getReduceWork().getReducer();
        String id = null;
        if (reducerOp instanceof JoinOperator) {
            if (parseCtx.getJoinOps().contains(reducerOp)) {
                id = ((JoinOperator) reducerOp).getConf().getId();
            }
        } else if (reducerOp instanceof MapJoinOperator) {
            if (parseCtx.getMapJoinOps().contains(reducerOp)) {
                id = ((MapJoinOperator) reducerOp).getConf().getId();
            }
        } else if (reducerOp instanceof SMBMapJoinOperator) {
            if (parseCtx.getSmbMapJoinOps().contains(reducerOp)) {
                id = ((SMBMapJoinOperator) reducerOp).getConf().getId();
            }
        }
        if (id != null) {
            streamDesc = id + ":$INTNAME";
        } else {
            streamDesc = "$INTNAME";
        }
        String origStreamDesc = streamDesc;
        int pos = 0;
        while (cplan.getMapWork().getAliasToWork().get(streamDesc) != null) {
            streamDesc = origStreamDesc.concat(String.valueOf(++pos));
        }
        // TODO: Allocate work to remove the temporary files and make that
        // dependent on the redTask
        cplan.getReduceWork().setNeedsTagging(true);
    }
    // Add the path to alias mapping
    setTaskPlan(taskTmpDir, streamDesc, tableScanOp, cplan.getMapWork(), false, tt_desc);
    opProcCtx.setCurrTopOp(null);
    opProcCtx.setCurrAliasId(null);
    opProcCtx.setCurrTask(childTask);
    opProcCtx.addRootIfPossible(parentTask);
}
Also used : ParseContext(org.apache.hadoop.hive.ql.parse.ParseContext) Context(org.apache.hadoop.hive.ql.Context) CompilationOpContext(org.apache.hadoop.hive.ql.CompilationOpContext) Path(org.apache.hadoop.fs.Path) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) DemuxOperator(org.apache.hadoop.hive.ql.exec.DemuxOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) UnionOperator(org.apache.hadoop.hive.ql.exec.UnionOperator) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) SMBMapJoinOperator(org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) SMBMapJoinOperator(org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator) MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) SMBMapJoinOperator(org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator) SparkTask(org.apache.hadoop.hive.ql.exec.spark.SparkTask) ConditionalTask(org.apache.hadoop.hive.ql.exec.ConditionalTask) Task(org.apache.hadoop.hive.ql.exec.Task) MoveTask(org.apache.hadoop.hive.ql.exec.MoveTask) MapRedTask(org.apache.hadoop.hive.ql.exec.mr.MapRedTask) DependencyCollectionTask(org.apache.hadoop.hive.ql.exec.DependencyCollectionTask) Serializable(java.io.Serializable) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) SMBMapJoinOperator(org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator) MapredWork(org.apache.hadoop.hive.ql.plan.MapredWork) ParseContext(org.apache.hadoop.hive.ql.parse.ParseContext) GenMapRedCtx(org.apache.hadoop.hive.ql.optimizer.GenMRProcContext.GenMapRedCtx) LoadTableDesc(org.apache.hadoop.hive.ql.plan.LoadTableDesc) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc)

Example 3 with SMBMapJoinOperator

use of org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator in project hive by apache.

the class GenMapRedUtils method splitTasks.

@SuppressWarnings("nls")
private static /**
 * Split two tasks by creating a temporary file between them.
 *
 * @param op reduce sink operator being processed
 * @param parentTask the parent task
 * @param childTask the child task
 * @param opProcCtx context
 */
void splitTasks(ReduceSinkOperator op, Task<?> parentTask, Task<?> childTask, GenMRProcContext opProcCtx) throws SemanticException {
    if (op.getNumParent() != 1) {
        throw new IllegalStateException("Expecting operator " + op + " to have one parent. " + "But found multiple parents : " + op.getParentOperators());
    }
    ParseContext parseCtx = opProcCtx.getParseCtx();
    parentTask.addDependentTask(childTask);
    // Root Task cannot depend on any other task, therefore childTask cannot be
    // a root Task
    List<Task<?>> rootTasks = opProcCtx.getRootTasks();
    if (rootTasks.contains(childTask)) {
        rootTasks.remove(childTask);
    }
    // Generate the temporary file name
    Context baseCtx = parseCtx.getContext();
    Path taskTmpDir = baseCtx.getMRTmpPath();
    Operator<? extends OperatorDesc> parent = op.getParentOperators().get(0);
    TableDesc tt_desc = PlanUtils.getIntermediateFileTableDesc(PlanUtils.getFieldSchemasFromRowSchema(parent.getSchema(), "temporarycol"));
    // Create the temporary file, its corresponding FileSinkOperaotr, and
    // its corresponding TableScanOperator.
    TableScanOperator tableScanOp = createTemporaryFile(parent, op, taskTmpDir, tt_desc, parseCtx);
    Map<Operator<? extends OperatorDesc>, GenMapRedCtx> mapCurrCtx = opProcCtx.getMapCurrCtx();
    mapCurrCtx.put(tableScanOp, new GenMapRedCtx(childTask, null));
    String streamDesc = taskTmpDir.toUri().toString();
    MapredWork cplan = (MapredWork) childTask.getWork();
    if (needsTagging(cplan.getReduceWork())) {
        Operator<? extends OperatorDesc> reducerOp = cplan.getReduceWork().getReducer();
        String id = null;
        if (reducerOp instanceof JoinOperator) {
            if (parseCtx.getJoinOps().contains(reducerOp)) {
                id = ((JoinOperator) reducerOp).getConf().getId();
            }
        } else if (reducerOp instanceof MapJoinOperator) {
            if (parseCtx.getMapJoinOps().contains(reducerOp)) {
                id = ((MapJoinOperator) reducerOp).getConf().getId();
            }
        } else if (reducerOp instanceof SMBMapJoinOperator) {
            if (parseCtx.getSmbMapJoinOps().contains(reducerOp)) {
                id = ((SMBMapJoinOperator) reducerOp).getConf().getId();
            }
        }
        if (id != null) {
            streamDesc = id + ":$INTNAME";
        } else {
            streamDesc = "$INTNAME";
        }
        String origStreamDesc = streamDesc;
        int pos = 0;
        while (cplan.getMapWork().getAliasToWork().get(streamDesc) != null) {
            streamDesc = origStreamDesc.concat(String.valueOf(++pos));
        }
        // TODO: Allocate work to remove the temporary files and make that
        // dependent on the redTask
        cplan.getReduceWork().setNeedsTagging(true);
    }
    // Add the path to alias mapping
    setTaskPlan(taskTmpDir, streamDesc, tableScanOp, cplan.getMapWork(), false, tt_desc);
    opProcCtx.setCurrTopOp(null);
    opProcCtx.setCurrAliasId(null);
    opProcCtx.setCurrTask(childTask);
    opProcCtx.addRootIfPossible(parentTask);
}
Also used : ParseContext(org.apache.hadoop.hive.ql.parse.ParseContext) Context(org.apache.hadoop.hive.ql.Context) CompilationOpContext(org.apache.hadoop.hive.ql.CompilationOpContext) Path(org.apache.hadoop.fs.Path) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) DemuxOperator(org.apache.hadoop.hive.ql.exec.DemuxOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) UnionOperator(org.apache.hadoop.hive.ql.exec.UnionOperator) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) SMBMapJoinOperator(org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) SMBMapJoinOperator(org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator) MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) SMBMapJoinOperator(org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator) SparkTask(org.apache.hadoop.hive.ql.exec.spark.SparkTask) ConditionalTask(org.apache.hadoop.hive.ql.exec.ConditionalTask) TezTask(org.apache.hadoop.hive.ql.exec.tez.TezTask) Task(org.apache.hadoop.hive.ql.exec.Task) MoveTask(org.apache.hadoop.hive.ql.exec.MoveTask) MapRedTask(org.apache.hadoop.hive.ql.exec.mr.MapRedTask) DependencyCollectionTask(org.apache.hadoop.hive.ql.exec.DependencyCollectionTask) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) SMBMapJoinOperator(org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator) MapredWork(org.apache.hadoop.hive.ql.plan.MapredWork) ParseContext(org.apache.hadoop.hive.ql.parse.ParseContext) GenMapRedCtx(org.apache.hadoop.hive.ql.optimizer.GenMRProcContext.GenMapRedCtx) LoadTableDesc(org.apache.hadoop.hive.ql.plan.LoadTableDesc) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc)

Example 4 with SMBMapJoinOperator

use of org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator in project hive by apache.

the class MapJoinProcessor method convertSMBJoinToMapJoin.

/**
 * convert a sortmerge join to a a map-side join.
 *
 * @param smbJoinOp
 *          join operator
 * @param bigTablePos
 *          position of the source to be read as part of map-reduce framework. All other sources
 *          are cached in memory
 * @param noCheckOuterJoin
 */
public static MapJoinOperator convertSMBJoinToMapJoin(HiveConf hconf, SMBMapJoinOperator smbJoinOp, int bigTablePos, boolean noCheckOuterJoin) throws SemanticException {
    // Create a new map join operator
    SMBJoinDesc smbJoinDesc = smbJoinOp.getConf();
    List<ExprNodeDesc> keyCols = smbJoinDesc.getKeys().get(Byte.valueOf((byte) 0));
    TableDesc keyTableDesc = PlanUtils.getMapJoinKeyTableDesc(hconf, PlanUtils.getFieldSchemasFromColumnList(keyCols, MAPJOINKEY_FIELDPREFIX));
    MapJoinDesc mapJoinDesc = new MapJoinDesc(smbJoinDesc.getKeys(), keyTableDesc, smbJoinDesc.getExprs(), smbJoinDesc.getValueTblDescs(), smbJoinDesc.getValueTblDescs(), smbJoinDesc.getOutputColumnNames(), bigTablePos, smbJoinDesc.getConds(), smbJoinDesc.getFilters(), smbJoinDesc.isNoOuterJoin(), smbJoinDesc.getDumpFilePrefix(), smbJoinDesc.getMemoryMonitorInfo(), smbJoinDesc.getInMemoryDataSize());
    mapJoinDesc.setStatistics(smbJoinDesc.getStatistics());
    mapJoinDesc.setColumnExprMap(smbJoinDesc.getColumnExprMap());
    RowSchema joinRS = smbJoinOp.getSchema();
    // The mapjoin has the same schema as the join operator
    MapJoinOperator mapJoinOp = (MapJoinOperator) OperatorFactory.getAndMakeChild(smbJoinOp.getCompilationOpContext(), mapJoinDesc, joinRS, new ArrayList<Operator<? extends OperatorDesc>>());
    // change the children of the original join operator to point to the map
    // join operator
    List<Operator<? extends OperatorDesc>> childOps = smbJoinOp.getChildOperators();
    for (Operator<? extends OperatorDesc> childOp : childOps) {
        childOp.replaceParent(smbJoinOp, mapJoinOp);
    }
    mapJoinOp.setChildOperators(childOps);
    smbJoinOp.setChildOperators(null);
    // change the parent of the original SMBjoin operator to point to the map
    // join operator
    List<Operator<? extends OperatorDesc>> parentOps = smbJoinOp.getParentOperators();
    for (Operator<? extends OperatorDesc> parentOp : parentOps) {
        parentOp.replaceChild(smbJoinOp, mapJoinOp);
    }
    mapJoinOp.setParentOperators(parentOps);
    smbJoinOp.setParentOperators(null);
    return mapJoinOp;
}
Also used : MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) AbstractMapJoinOperator(org.apache.hadoop.hive.ql.exec.AbstractMapJoinOperator) SMBMapJoinOperator(org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator) LateralViewJoinOperator(org.apache.hadoop.hive.ql.exec.LateralViewJoinOperator) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) GroupByOperator(org.apache.hadoop.hive.ql.exec.GroupByOperator) UnionOperator(org.apache.hadoop.hive.ql.exec.UnionOperator) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) AbstractMapJoinOperator(org.apache.hadoop.hive.ql.exec.AbstractMapJoinOperator) SelectOperator(org.apache.hadoop.hive.ql.exec.SelectOperator) SMBMapJoinOperator(org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) ScriptOperator(org.apache.hadoop.hive.ql.exec.ScriptOperator) RowSchema(org.apache.hadoop.hive.ql.exec.RowSchema) MapJoinDesc(org.apache.hadoop.hive.ql.plan.MapJoinDesc) SMBJoinDesc(org.apache.hadoop.hive.ql.plan.SMBJoinDesc) ArrayList(java.util.ArrayList) ExprNodeDesc(org.apache.hadoop.hive.ql.plan.ExprNodeDesc) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc)

Example 5 with SMBMapJoinOperator

use of org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator in project hive by apache.

the class AbstractSMBJoinProc method convertBucketMapJoinToSMBJoin.

// Convert the bucket map-join operator to a sort-merge map join operator
protected SMBMapJoinOperator convertBucketMapJoinToSMBJoin(MapJoinOperator mapJoinOp, SortBucketJoinProcCtx smbJoinContext) {
    String[] srcs = smbJoinContext.getSrcs();
    SMBMapJoinOperator smbJop = new SMBMapJoinOperator(mapJoinOp);
    SMBJoinDesc smbJoinDesc = new SMBJoinDesc(mapJoinOp.getConf());
    smbJop.setConf(smbJoinDesc);
    HashMap<Byte, String> tagToAlias = new HashMap<Byte, String>();
    for (int i = 0; i < srcs.length; i++) {
        tagToAlias.put((byte) i, srcs[i]);
    }
    smbJoinDesc.setTagToAlias(tagToAlias);
    int indexInListMapJoinNoReducer = this.pGraphContext.getListMapJoinOpsNoReducer().indexOf(mapJoinOp);
    if (indexInListMapJoinNoReducer >= 0) {
        this.pGraphContext.getListMapJoinOpsNoReducer().remove(indexInListMapJoinNoReducer);
        this.pGraphContext.getListMapJoinOpsNoReducer().add(indexInListMapJoinNoReducer, smbJop);
    }
    Map<String, DummyStoreOperator> aliasToSink = new HashMap<String, DummyStoreOperator>();
    // For all parents (other than the big table), insert a dummy store operator
    /* Consider a query like:
     *
     * select * from
     *   (subq1 --> has a filter)
     *   join
     *   (subq2 --> has a filter)
     * on some key
     *
     * Let us assume that subq1 is the small table (either specified by the user or inferred
     * automatically). The following operator tree will be created:
     *
     * TableScan (subq1) --> Select --> Filter --> DummyStore
     *                                                         \
     *                                                          \     SMBJoin
     *                                                          /
     *                                                         /
     * TableScan (subq2) --> Select --> Filter
     */
    List<Operator<? extends OperatorDesc>> parentOperators = mapJoinOp.getParentOperators();
    for (int i = 0; i < parentOperators.size(); i++) {
        Operator<? extends OperatorDesc> par = parentOperators.get(i);
        int index = par.getChildOperators().indexOf(mapJoinOp);
        par.getChildOperators().remove(index);
        if (i == smbJoinDesc.getPosBigTable()) {
            par.getChildOperators().add(index, smbJop);
        } else {
            DummyStoreOperator dummyStoreOp = (DummyStoreOperator) OperatorFactory.get(par.getCompilationOpContext(), new DummyStoreDesc());
            par.getChildOperators().add(index, dummyStoreOp);
            List<Operator<? extends OperatorDesc>> childrenOps = new ArrayList<Operator<? extends OperatorDesc>>();
            childrenOps.add(smbJop);
            dummyStoreOp.setChildOperators(childrenOps);
            List<Operator<? extends OperatorDesc>> parentOps = new ArrayList<Operator<? extends OperatorDesc>>();
            parentOps.add(par);
            dummyStoreOp.setParentOperators(parentOps);
            aliasToSink.put(srcs[i], dummyStoreOp);
            smbJop.getParentOperators().remove(i);
            smbJop.getParentOperators().add(i, dummyStoreOp);
        }
    }
    smbJoinDesc.setAliasToSink(aliasToSink);
    List<Operator<? extends OperatorDesc>> childOps = mapJoinOp.getChildOperators();
    for (int i = 0; i < childOps.size(); i++) {
        Operator<? extends OperatorDesc> child = childOps.get(i);
        int index = child.getParentOperators().indexOf(mapJoinOp);
        child.getParentOperators().remove(index);
        child.getParentOperators().add(index, smbJop);
    }
    // Data structures coming from QBJoinTree
    smbJop.getConf().setQBJoinTreeProps(mapJoinOp.getConf());
    // 
    pGraphContext.getSmbMapJoinOps().add(smbJop);
    pGraphContext.getMapJoinOps().remove(mapJoinOp);
    return smbJop;
}
Also used : ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) SMBMapJoinOperator(org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) DummyStoreOperator(org.apache.hadoop.hive.ql.exec.DummyStoreOperator) DummyStoreDesc(org.apache.hadoop.hive.ql.plan.DummyStoreDesc) SMBJoinDesc(org.apache.hadoop.hive.ql.plan.SMBJoinDesc) HashMap(java.util.HashMap) DummyStoreOperator(org.apache.hadoop.hive.ql.exec.DummyStoreOperator) ArrayList(java.util.ArrayList) SMBMapJoinOperator(org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc)

Aggregations

SMBMapJoinOperator (org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator)19 MapJoinOperator (org.apache.hadoop.hive.ql.exec.MapJoinOperator)13 Operator (org.apache.hadoop.hive.ql.exec.Operator)11 ReduceSinkOperator (org.apache.hadoop.hive.ql.exec.ReduceSinkOperator)11 ArrayList (java.util.ArrayList)8 Path (org.apache.hadoop.fs.Path)8 FileSinkOperator (org.apache.hadoop.hive.ql.exec.FileSinkOperator)8 JoinOperator (org.apache.hadoop.hive.ql.exec.JoinOperator)8 OperatorDesc (org.apache.hadoop.hive.ql.plan.OperatorDesc)8 TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator)7 UnionOperator (org.apache.hadoop.hive.ql.exec.UnionOperator)7 LinkedHashMap (java.util.LinkedHashMap)4 AbstractMapJoinOperator (org.apache.hadoop.hive.ql.exec.AbstractMapJoinOperator)4 GroupByOperator (org.apache.hadoop.hive.ql.exec.GroupByOperator)4 SelectOperator (org.apache.hadoop.hive.ql.exec.SelectOperator)4 Task (org.apache.hadoop.hive.ql.exec.Task)4 MapRedTask (org.apache.hadoop.hive.ql.exec.mr.MapRedTask)4 MapredWork (org.apache.hadoop.hive.ql.plan.MapredWork)4 TableDesc (org.apache.hadoop.hive.ql.plan.TableDesc)4 HashMap (java.util.HashMap)3