
Example 11 with MapRedTask

use of org.apache.hadoop.hive.ql.exec.mr.MapRedTask in project hive by apache.

the class IndexWhereProcessor method process.

/**
 * Process a node of the operator tree. This matches on the rule in IndexWhereTaskDispatcher.
 */
@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx, Object... nodeOutputs) throws SemanticException {
    TableScanOperator operator = (TableScanOperator) nd;
    List<Node> opChildren = operator.getChildren();
    TableScanDesc operatorDesc = operator.getConf();
    if (operatorDesc == null || !tsToIndices.containsKey(operator)) {
        return null;
    }
    List<Index> indexes = tsToIndices.get(operator);
    ExprNodeDesc predicate = operatorDesc.getFilterExpr();
    IndexWhereProcCtx context = (IndexWhereProcCtx) procCtx;
    ParseContext pctx = context.getParseContext();
    LOG.info("Processing predicate for index optimization");
    if (predicate == null) {
        LOG.info("null predicate pushed down");
        return null;
    }
    LOG.info(predicate.getExprString());
    // check if we have indexes on all partitions in this table scan
    Set<Partition> queryPartitions;
    try {
        queryPartitions = IndexUtils.checkPartitionsCoveredByIndex(operator, pctx, indexes);
        if (queryPartitions == null) {
            // partitions not covered
            return null;
        }
    } catch (HiveException e) {
        LOG.error("Fatal Error: problem accessing metastore", e);
        throw new SemanticException(e);
    }
    // we can only process MapReduce tasks to check input size
    if (!context.getCurrentTask().isMapRedTask()) {
        return null;
    }
    MapRedTask currentTask = (MapRedTask) context.getCurrentTask();
    // get potential reentrant index queries from each index
    Map<Index, HiveIndexQueryContext> queryContexts = new HashMap<Index, HiveIndexQueryContext>();
    // make sure we have an index on the table being scanned
    TableDesc tblDesc = operator.getTableDesc();
    Map<String, List<Index>> indexesByType = new HashMap<String, List<Index>>();
    for (Index indexOnTable : indexes) {
        if (indexesByType.get(indexOnTable.getIndexHandlerClass()) == null) {
            List<Index> newType = new ArrayList<Index>();
            newType.add(indexOnTable);
            indexesByType.put(indexOnTable.getIndexHandlerClass(), newType);
        } else {
            indexesByType.get(indexOnTable.getIndexHandlerClass()).add(indexOnTable);
        }
    }
    // choose the index type with the most indexes of the same type on the table
    // TODO HIVE-2130 This would be a good place for some sort of cost based choice?
    List<Index> bestIndexes = indexesByType.values().iterator().next();
    for (List<Index> indexTypes : indexesByType.values()) {
        if (bestIndexes.size() < indexTypes.size()) {
            bestIndexes = indexTypes;
        }
    }
    // rewrite index queries for the chosen index type
    HiveIndexQueryContext tmpQueryContext = new HiveIndexQueryContext();
    tmpQueryContext.setQueryPartitions(queryPartitions);
    rewriteForIndexes(predicate, bestIndexes, pctx, currentTask, tmpQueryContext);
    List<Task<?>> indexTasks = tmpQueryContext.getQueryTasks();
    if (indexTasks != null && indexTasks.size() > 0) {
        queryContexts.put(bestIndexes.get(0), tmpQueryContext);
    }
    // choose an index rewrite to use
    if (queryContexts.size() > 0) {
        // TODO HIVE-2130 This would be a good place for some sort of cost based choice?
        Index chosenIndex = queryContexts.keySet().iterator().next();
        // modify the parse context to use indexing
        // we need to delay this until we choose one index so that we don't attempt to modify pctx multiple times
        HiveIndexQueryContext queryContext = queryContexts.get(chosenIndex);
        // prepare the map reduce job to use indexing
        MapWork work = currentTask.getWork().getMapWork();
        work.setInputformat(queryContext.getIndexInputFormat());
        work.addIndexIntermediateFile(queryContext.getIndexIntermediateFile());
        // modify inputs based on index query
        Set<ReadEntity> inputs = pctx.getSemanticInputs();
        inputs.addAll(queryContext.getAdditionalSemanticInputs());
        List<Task<?>> chosenRewrite = queryContext.getQueryTasks();
        // add dependencies so index query runs first
        insertIndexQuery(pctx, context, chosenRewrite);
    }
    return null;
}
Also used : HiveIndexQueryContext(org.apache.hadoop.hive.ql.index.HiveIndexQueryContext) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Task(org.apache.hadoop.hive.ql.exec.Task) MapRedTask(org.apache.hadoop.hive.ql.exec.mr.MapRedTask) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) HashMap(java.util.HashMap) Node(org.apache.hadoop.hive.ql.lib.Node) ArrayList(java.util.ArrayList) Index(org.apache.hadoop.hive.metastore.api.Index) List(java.util.List) ExprNodeDesc(org.apache.hadoop.hive.ql.plan.ExprNodeDesc) SemanticException(org.apache.hadoop.hive.ql.parse.SemanticException) Partition(org.apache.hadoop.hive.ql.metadata.Partition) TableScanDesc(org.apache.hadoop.hive.ql.plan.TableScanDesc) ReadEntity(org.apache.hadoop.hive.ql.hooks.ReadEntity) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) ParseContext(org.apache.hadoop.hive.ql.parse.ParseContext) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc)
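
The index-type selection above boils down to grouping the table's indexes by handler class and taking the largest group (the TODO notes that a cost-based choice would be better). A minimal standalone sketch of that "group by key, pick the biggest group" pattern, with made-up class and data names that are not part of Hive:

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class LargestGroupSketch {

    // Group items by a key and return the largest group, mirroring how the
    // processor above picks the index type with the most indexes on the table.
    static List<String> largestGroup(Map<String, String> itemToKey) {
        Map<String, List<String>> byKey = new HashMap<>();
        for (Map.Entry<String, String> e : itemToKey.entrySet()) {
            byKey.computeIfAbsent(e.getValue(), k -> new ArrayList<>()).add(e.getKey());
        }
        List<String> best = null;
        for (List<String> group : byKey.values()) {
            if (best == null || group.size() > best.size()) {
                best = group;
            }
        }
        return best;
    }

    public static void main(String[] args) {
        Map<String, String> indexToHandler = new HashMap<>();
        indexToHandler.put("idx_a", "CompactIndexHandler");
        indexToHandler.put("idx_b", "CompactIndexHandler");
        indexToHandler.put("idx_c", "BitmapIndexHandler");
        // Prints the two "CompactIndexHandler" indexes (iteration order may vary).
        System.out.println(largestGroup(indexToHandler));
    }
}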

Example 12 with MapRedTask

use of org.apache.hadoop.hive.ql.exec.mr.MapRedTask in project hive by apache.

the class Utilities method reworkMapRedWork.

/**
 * The check here is not especially clean. It first loops through all input
 * formats and collects the ones that implement ReworkMapredInputFormat into a
 * set, then iterates over that set and calls rework on each one.
 *
 * Technically all of this could be avoided if all of Hive's input formats
 * shared a common interface. With today's Hive and Hadoop that is not
 * possible, because many of the input formats Hive uses live in Hadoop's
 * code, and most of Hadoop's input formats only implement the InputFormat
 * interface.
 *
 * @param task
 * @param reworkMapredWork
 * @param conf
 * @throws SemanticException
 */
public static void reworkMapRedWork(Task<? extends Serializable> task, boolean reworkMapredWork, HiveConf conf) throws SemanticException {
    if (reworkMapredWork && (task instanceof MapRedTask)) {
        try {
            MapredWork mapredWork = ((MapRedTask) task).getWork();
            Set<Class<? extends InputFormat>> reworkInputFormats = new HashSet<Class<? extends InputFormat>>();
            for (PartitionDesc part : mapredWork.getMapWork().getPathToPartitionInfo().values()) {
                Class<? extends InputFormat> inputFormatCls = part.getInputFileFormatClass();
                if (ReworkMapredInputFormat.class.isAssignableFrom(inputFormatCls)) {
                    reworkInputFormats.add(inputFormatCls);
                }
            }
            if (reworkInputFormats.size() > 0) {
                for (Class<? extends InputFormat> inputFormatCls : reworkInputFormats) {
                    ReworkMapredInputFormat inst = (ReworkMapredInputFormat) ReflectionUtil.newInstance(inputFormatCls, null);
                    inst.rework(conf, mapredWork);
                }
            }
        } catch (IOException e) {
            throw new SemanticException(e);
        }
    }
}
Also used : MapRedTask(org.apache.hadoop.hive.ql.exec.mr.MapRedTask) MapredWork(org.apache.hadoop.hive.ql.plan.MapredWork) SequenceFileInputFormat(org.apache.hadoop.mapred.SequenceFileInputFormat) ReworkMapredInputFormat(org.apache.hadoop.hive.ql.io.ReworkMapredInputFormat) ContentSummaryInputFormat(org.apache.hadoop.hive.ql.io.ContentSummaryInputFormat) InputFormat(org.apache.hadoop.mapred.InputFormat) FileInputFormat(org.apache.hadoop.mapred.FileInputFormat) TextInputFormat(org.apache.hadoop.mapred.TextInputFormat) OneNullRowInputFormat(org.apache.hadoop.hive.ql.io.OneNullRowInputFormat) HiveInputFormat(org.apache.hadoop.hive.ql.io.HiveInputFormat) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc) IOException(java.io.IOException) HashSet(java.util.HashSet) SemanticException(org.apache.hadoop.hive.ql.parse.SemanticException)
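
reworkMapRedWork is essentially a filter-then-dispatch: collect only the input-format classes that implement the marker interface, deduplicate them in a set, then instantiate each one reflectively and call its hook. A minimal sketch of that shape using stand-in types (Reworkable, PlainFormat, and SpecialFormat are hypothetical, not Hive classes; the real code instantiates via ReflectionUtil and calls rework(conf, mapredWork)):

import java.util.HashSet;
import java.util.List;
import java.util.Set;

public class ReworkDispatchSketch {

    // Stand-in for ReworkMapredInputFormat: only classes implementing this get reworked.
    interface Reworkable {
        void rework(String context);
    }

    static class PlainFormat {
    }

    static class SpecialFormat implements Reworkable {
        @Override
        public void rework(String context) {
            System.out.println("reworking with context: " + context);
        }
    }

    public static void main(String[] args) throws Exception {
        List<Class<?>> inputFormats = List.of(PlainFormat.class, SpecialFormat.class);

        // 1) Keep only the classes that implement the marker interface, deduplicated.
        Set<Class<?>> toRework = new HashSet<>();
        for (Class<?> cls : inputFormats) {
            if (Reworkable.class.isAssignableFrom(cls)) {
                toRework.add(cls);
            }
        }

        // 2) Instantiate each one reflectively and invoke its hook, as reworkMapRedWork does.
        for (Class<?> cls : toRework) {
            Reworkable inst = (Reworkable) cls.getDeclaredConstructor().newInstance();
            inst.rework("job-conf");
        }
    }
}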

Example 13 with MapRedTask

use of org.apache.hadoop.hive.ql.exec.mr.MapRedTask in project hive by apache.

the class TestGenMapRedUtilsCreateConditionalTask method setUp.

@Before
public void setUp() {
    dummyMRTask = new MapRedTask();
    SessionState.start(hiveConf);
}
Also used : MapRedTask(org.apache.hadoop.hive.ql.exec.mr.MapRedTask) Before(org.junit.Before)
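
A bare new MapRedTask() like the one in this setUp carries no plan yet; tests that exercise code inspecting the task usually attach a MapredWork first. A small sketch of that, assuming the usual Task.setWork accessor (how the conditional-task test wires things up beyond this setUp is not shown here):

import org.apache.hadoop.hive.ql.exec.mr.MapRedTask;
import org.apache.hadoop.hive.ql.plan.MapredWork;

public class DummyTaskSketch {

    public static void main(String[] args) {
        // Freshly constructed task with an empty plan attached
        // (assumes Task.setWork; no cluster or SessionState is needed just to build the objects).
        MapRedTask dummyMRTask = new MapRedTask();
        MapredWork work = new MapredWork();
        dummyMRTask.setWork(work);
        System.out.println("is map-red task: " + dummyMRTask.isMapRedTask());
    }
}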

Example 14 with MapRedTask

use of org.apache.hadoop.hive.ql.exec.mr.MapRedTask in project hive by apache.

the class SamplingOptimizer method resolve.

public PhysicalContext resolve(PhysicalContext pctx) throws SemanticException {
    for (Task<?> task : pctx.getRootTasks()) {
        if (!(task instanceof MapRedTask) || !((MapRedTask) task).getWork().isFinalMapRed()) {
            // this could be replaced by bucketing on RS + bucketed fetcher for next MR
            continue;
        }
        MapredWork mrWork = ((MapRedTask) task).getWork();
        MapWork mapWork = mrWork.getMapWork();
        ReduceWork reduceWork = mrWork.getReduceWork();
        if (reduceWork == null || reduceWork.getNumReduceTasks() != 1 || mapWork.getAliasToWork().size() != 1 || mapWork.getSamplingType() > 0 || reduceWork.getReducer() == null) {
            continue;
        }
        // GROUPBY operator in reducer may not be processed in parallel. Skip optimizing.
        if (OperatorUtils.findSingleOperator(reduceWork.getReducer(), GroupByOperator.class) != null) {
            continue;
        }
        Operator<?> operator = mapWork.getAliasToWork().values().iterator().next();
        if (!(operator instanceof TableScanOperator)) {
            continue;
        }
        TableScanOperator tsop = (TableScanOperator) operator;
        Table tbl = tsop.getConf().getTableMetadata();
        if (tbl == null) {
            continue;
        }
        if (AcidUtils.isInsertOnlyTable(tbl.getParameters())) {
            // sampler will limit the input to the correct directories, but we don't care about MR.
            continue;
        }
        ReduceSinkOperator child = OperatorUtils.findSingleOperator(operator, ReduceSinkOperator.class);
        if (child == null || child.getConf().getNumReducers() != 1 || !child.getConf().getPartitionCols().isEmpty()) {
            continue;
        }
        child.getConf().setNumReducers(-1);
        reduceWork.setNumReduceTasks(-1);
        mapWork.setSamplingType(MapWork.SAMPLING_ON_START);
    }
    return pctx;
}
Also used : MapRedTask(org.apache.hadoop.hive.ql.exec.mr.MapRedTask) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) GroupByOperator(org.apache.hadoop.hive.ql.exec.GroupByOperator) Table(org.apache.hadoop.hive.ql.metadata.Table) MapredWork(org.apache.hadoop.hive.ql.plan.MapredWork) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) ReduceWork(org.apache.hadoop.hive.ql.plan.ReduceWork)
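
OperatorUtils.findSingleOperator is used here only as an existence check: if the reducer tree contains a GroupByOperator, the plan is skipped. A rough standalone sketch of that kind of search for a unique descendant of a given type, over a toy node class (the exact multiplicity semantics of the real Hive utility are an assumption here):

import java.util.ArrayDeque;
import java.util.Deque;
import java.util.List;

public class FindSingleSketch {

    // Minimal stand-in for an operator node with children.
    static class Op {
        final List<Op> children;
        Op(List<Op> children) {
            this.children = children;
        }
    }

    static class GroupByOp extends Op {
        GroupByOp(List<Op> children) {
            super(children);
        }
    }

    // Depth-first search that returns the descendant of the requested type if there is
    // exactly one (including the start node), and null otherwise.
    static <T extends Op> T findSingle(Op start, Class<T> type) {
        T found = null;
        Deque<Op> stack = new ArrayDeque<>();
        stack.push(start);
        while (!stack.isEmpty()) {
            Op cur = stack.pop();
            if (type.isInstance(cur)) {
                if (found != null) {
                    return null; // more than one match: not "single"
                }
                found = type.cast(cur);
            }
            for (Op child : cur.children) {
                stack.push(child);
            }
        }
        return found;
    }

    public static void main(String[] args) {
        Op reducerTree = new Op(List.of(new GroupByOp(List.of())));
        // Non-null means a single GROUP BY was found, so the optimizer above would skip this plan.
        System.out.println(findSingle(reducerTree, GroupByOp.class) != null);
    }
}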

Example 15 with MapRedTask

use of org.apache.hadoop.hive.ql.exec.mr.MapRedTask in project hive by apache.

the class MoveTask method inferTaskInformation.

private void inferTaskInformation(TaskInformation ti) {
    // Walk up the chain of single-parent tasks looking for a map-reduce task
    // (either standard, local, or a merge)
    while (ti.task.getParentTasks() != null && ti.task.getParentTasks().size() == 1) {
        ti.task = (Task) ti.task.getParentTasks().get(0);
        // If it was a merge task or a local map reduce task, nothing can be inferred
        if (ti.task instanceof MergeFileTask || ti.task instanceof MapredLocalTask) {
            break;
        }
        // If it is a map-reduce task, bucketing and sorting information can be inferred from
        // the directory this move task is moving
        if (ti.task instanceof MapRedTask) {
            MapredWork work = (MapredWork) ti.task.getWork();
            MapWork mapWork = work.getMapWork();
            ti.bucketCols = mapWork.getBucketedColsByDirectory().get(ti.path);
            ti.sortCols = mapWork.getSortedColsByDirectory().get(ti.path);
            if (work.getReduceWork() != null) {
                ti.numBuckets = work.getReduceWork().getNumReduceTasks();
            }
            if (ti.bucketCols != null || ti.sortCols != null) {
                // (the bucketing/sorting columns are only meaningful if this work contains the
                // operator that writes the final output)
                assert work.isFinalMapRed();
            }
            break;
        }
        // A parent MoveTask can occur when the condition for merging is not met, see GenMRFileSink1;
        // in that case follow the path that move task was loading.
        if (ti.task instanceof MoveTask) {
            MoveTask mt = (MoveTask) ti.task;
            if (mt.getWork().getLoadFileWork() != null) {
                ti.path = mt.getWork().getLoadFileWork().getSourcePath().toUri().toString();
            }
        }
    }
}
Also used : MapRedTask(org.apache.hadoop.hive.ql.exec.mr.MapRedTask) MapredWork(org.apache.hadoop.hive.ql.plan.MapredWork) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) MapredLocalTask(org.apache.hadoop.hive.ql.exec.mr.MapredLocalTask) MergeFileTask(org.apache.hadoop.hive.ql.io.merge.MergeFileTask)
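
The loop above walks the single-parent chain upward and stops at the first parent it can learn something from (merge task, local task, or map-reduce task), optionally updating the tracked path when it passes a MoveTask. The same traversal pattern in isolation, over a toy node type (hypothetical names, just to show the shape of the walk):

import java.util.List;

public class AncestorWalkSketch {

    static class Node {
        final String kind;
        final List<Node> parents;
        Node(String kind, List<Node> parents) {
            this.kind = kind;
            this.parents = parents;
        }
    }

    // Follow the chain upward while there is exactly one parent, returning the first
    // ancestor of the requested kind, or null if the chain ends or branches first.
    static Node firstAncestorOfKind(Node start, String kind) {
        Node cur = start;
        while (cur.parents != null && cur.parents.size() == 1) {
            cur = cur.parents.get(0);
            if (kind.equals(cur.kind)) {
                return cur;
            }
        }
        return null;
    }

    public static void main(String[] args) {
        Node mapRed = new Node("mapred", List.of());
        Node move = new Node("move", List.of(mapRed));
        Node leaf = new Node("move", List.of(move));
        System.out.println(firstAncestorOfKind(leaf, "mapred").kind); // prints "mapred"
    }
}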

Aggregations

MapRedTask (org.apache.hadoop.hive.ql.exec.mr.MapRedTask) 21
Task (org.apache.hadoop.hive.ql.exec.Task) 9
MapredWork (org.apache.hadoop.hive.ql.plan.MapredWork) 9
ArrayList (java.util.ArrayList) 8
ConditionalTask (org.apache.hadoop.hive.ql.exec.ConditionalTask) 8
SemanticException (org.apache.hadoop.hive.ql.parse.SemanticException) 8
MapWork (org.apache.hadoop.hive.ql.plan.MapWork) 8
TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator) 7
List (java.util.List) 6
Operator (org.apache.hadoop.hive.ql.exec.Operator) 5
Serializable (java.io.Serializable) 4
Path (org.apache.hadoop.fs.Path) 4
FileSinkOperator (org.apache.hadoop.hive.ql.exec.FileSinkOperator) 4
IOException (java.io.IOException) 3
HashSet (java.util.HashSet) 3
JoinOperator (org.apache.hadoop.hive.ql.exec.JoinOperator) 3
TezTask (org.apache.hadoop.hive.ql.exec.tez.TezTask) 3
OperatorDesc (org.apache.hadoop.hive.ql.plan.OperatorDesc) 3
ReduceWork (org.apache.hadoop.hive.ql.plan.ReduceWork) 3
HashMap (java.util.HashMap) 2