
Example 31 with ReduceWork

Use of org.apache.hadoop.hive.ql.plan.ReduceWork in project hive by apache.

The class SparkCrossProductCheck, method checkShuffleJoin.

private void checkShuffleJoin(SparkWork sparkWork) throws SemanticException {
    for (ReduceWork reduceWork : sparkWork.getAllReduceWork()) {
        Operator<? extends OperatorDesc> reducer = reduceWork.getReducer();
        if (reducer instanceof JoinOperator || reducer instanceof CommonMergeJoinOperator) {
            Map<Integer, CrossProductCheck.ExtractReduceSinkInfo.Info> rsInfo = new HashMap<Integer, CrossProductCheck.ExtractReduceSinkInfo.Info>();
            for (BaseWork parent : sparkWork.getParents(reduceWork)) {
                rsInfo.putAll(new CrossProductCheck.ExtractReduceSinkInfo(null).analyze(parent));
            }
            checkForCrossProduct(reduceWork.getName(), reducer, rsInfo);
        }
    }
}
Also used : CommonMergeJoinOperator(org.apache.hadoop.hive.ql.exec.CommonMergeJoinOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) HashMap(java.util.HashMap) ReduceWork(org.apache.hadoop.hive.ql.plan.ReduceWork) BaseWork(org.apache.hadoop.hive.ql.plan.BaseWork)
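
For orientation, here is a minimal sketch (not Hive code) of the traversal pattern used above in isolation: walk every ReduceWork of a SparkWork and inspect its reducer and parent works. The class and method names below are invented for illustration; only the SparkWork/ReduceWork calls already shown in the example are relied on.

import org.apache.hadoop.hive.ql.exec.JoinOperator;
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.plan.OperatorDesc;
import org.apache.hadoop.hive.ql.plan.ReduceWork;
import org.apache.hadoop.hive.ql.plan.SparkWork;

public class WorkGraphInspectorSketch {
    // Prints one line per ReduceWork: its name, the reducer's class, whether the
    // reducer is a shuffle join, and how many parent works feed it.
    public static void describe(SparkWork sparkWork) {
        for (ReduceWork reduceWork : sparkWork.getAllReduceWork()) {
            Operator<? extends OperatorDesc> reducer = reduceWork.getReducer();
            boolean isShuffleJoin = reducer instanceof JoinOperator;
            System.out.println(reduceWork.getName()
                + " reducer=" + reducer.getClass().getSimpleName()
                + " shuffleJoin=" + isShuffleJoin
                + " parents=" + sparkWork.getParents(reduceWork).size());
        }
    }
}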

Example 32 with ReduceWork

Use of org.apache.hadoop.hive.ql.plan.ReduceWork in project hive by apache.

The class DagUtils, method createVertex.

/**
   * Create a vertex from a given work object.
   *
   * @param conf JobConf to be used for this execution unit
   * @param work The instance of BaseWork representing the actual work to be performed
   * by this vertex.
   * @param scratchDir HDFS scratch dir for this execution unit.
   * @param appJarLr Local resource for hive-exec.
   * @param additionalLr Additional local resources to be made available to this vertex
   * @param fileSystem FS corresponding to scratchDir and LocalResources
   * @param ctx This query's context
   * @return Vertex
   */
@SuppressWarnings("deprecation")
public Vertex createVertex(JobConf conf, BaseWork work, Path scratchDir, LocalResource appJarLr, List<LocalResource> additionalLr, FileSystem fileSystem, Context ctx, boolean hasChildren, TezWork tezWork, VertexType vertexType) throws Exception {
    Vertex v = null;
    // dispatch on the concrete BaseWork subtype to create the right kind of vertex
    if (work instanceof MapWork) {
        v = createVertex(conf, (MapWork) work, appJarLr, additionalLr, fileSystem, scratchDir, ctx, vertexType);
    } else if (work instanceof ReduceWork) {
        v = createVertex(conf, (ReduceWork) work, appJarLr, additionalLr, fileSystem, scratchDir, ctx);
    } else if (work instanceof MergeJoinWork) {
        v = createVertex(conf, (MergeJoinWork) work, appJarLr, additionalLr, fileSystem, scratchDir, ctx, vertexType);
    } else {
        // something is seriously wrong if this is happening
        throw new HiveException(ErrorMsg.GENERIC_ERROR.getErrorCodedMsg());
    }
    // initialize stats publisher if necessary
    if (work.isGatheringStats()) {
        StatsPublisher statsPublisher;
        StatsFactory factory = StatsFactory.newFactory(conf);
        if (factory != null) {
            StatsCollectionContext sCntxt = new StatsCollectionContext(conf);
            sCntxt.setStatsTmpDirs(Utilities.getStatsTmpDirs(work, conf));
            statsPublisher = factory.getStatsPublisher();
            if (!statsPublisher.init(sCntxt)) {
                // creating stats table if not exists
                if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_STATS_RELIABLE)) {
                    throw new HiveException(ErrorMsg.STATSPUBLISHER_INITIALIZATION_ERROR.getErrorCodedMsg());
                }
            }
        }
    }
    // final vertices need to have at least one output
    if (!hasChildren) {
        v.addDataSink("out_" + work.getName(), new DataSinkDescriptor(OutputDescriptor.create(MROutput.class.getName()).setUserPayload(TezUtils.createUserPayloadFromConf(conf)), null, null));
    }
    return v;
}
Also used : StatsPublisher(org.apache.hadoop.hive.ql.stats.StatsPublisher) StatsCollectionContext(org.apache.hadoop.hive.ql.stats.StatsCollectionContext) Vertex(org.apache.tez.dag.api.Vertex) PreWarmVertex(org.apache.tez.dag.api.PreWarmVertex) StatsFactory(org.apache.hadoop.hive.ql.stats.StatsFactory) MergeJoinWork(org.apache.hadoop.hive.ql.plan.MergeJoinWork) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) ReduceWork(org.apache.hadoop.hive.ql.plan.ReduceWork) DataSinkDescriptor(org.apache.tez.dag.api.DataSinkDescriptor)
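
As a rough usage sketch (assumptions, not Hive's actual TezTask code): a DAG-building loop would call createVertex once per BaseWork and add the result to the Tez DAG. The helper name and the way hasChildren is derived are assumptions; only the createVertex signature comes from the example above.

import java.util.List;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.Context;
import org.apache.hadoop.hive.ql.exec.tez.DagUtils;
import org.apache.hadoop.hive.ql.plan.BaseWork;
import org.apache.hadoop.hive.ql.plan.TezWork;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.yarn.api.records.LocalResource;
import org.apache.tez.dag.api.DAG;
import org.apache.tez.dag.api.Vertex;

public class VertexBuilderSketch {
    // Builds one vertex for the given BaseWork and adds it to the DAG. Leaf works
    // (no children in the TezWork graph) get hasChildren == false, which makes
    // createVertex attach the default MROutput data sink shown above.
    public void buildAndAddVertex(DagUtils dagUtils, DAG dag, JobConf conf, TezWork tezWork,
            BaseWork work, Path scratchDir, LocalResource appJarLr,
            List<LocalResource> additionalLr, FileSystem fs, Context ctx,
            TezWork.VertexType vertexType) throws Exception {
        // assumes TezWork exposes getChildren(BaseWork), analogous to SparkWork.getParents above
        boolean hasChildren = !tezWork.getChildren(work).isEmpty();
        Vertex v = dagUtils.createVertex(conf, work, scratchDir, appJarLr,
            additionalLr, fs, ctx, hasChildren, tezWork, vertexType);
        dag.addVertex(v);
    }
}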

Example 33 with ReduceWork

Use of org.apache.hadoop.hive.ql.plan.ReduceWork in project hive by apache.

The class SparkReduceRecordHandler, method init.

@Override
@SuppressWarnings("unchecked")
public void init(JobConf job, OutputCollector output, Reporter reporter) throws Exception {
    perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.SPARK_INIT_OPERATORS);
    super.init(job, output, reporter);
    rowObjectInspector = new ObjectInspector[Byte.MAX_VALUE];
    ObjectInspector[] valueObjectInspector = new ObjectInspector[Byte.MAX_VALUE];
    ObjectInspector keyObjectInspector;
    ReduceWork gWork = Utilities.getReduceWork(job);
    reducer = gWork.getReducer();
    vectorized = gWork.getVectorMode();
    // clear out any parents as reducer is the root
    reducer.setParentOperators(null);
    isTagged = gWork.getNeedsTagging();
    try {
        keyTableDesc = gWork.getKeyDesc();
        inputKeyDeserializer = ReflectionUtils.newInstance(keyTableDesc.getDeserializerClass(), null);
        SerDeUtils.initializeSerDe(inputKeyDeserializer, null, keyTableDesc.getProperties(), null);
        keyObjectInspector = inputKeyDeserializer.getObjectInspector();
        valueTableDesc = new TableDesc[gWork.getTagToValueDesc().size()];
        if (vectorized) {
            final int maxTags = gWork.getTagToValueDesc().size();
            keyStructInspector = (StructObjectInspector) keyObjectInspector;
            batches = new VectorizedRowBatch[maxTags];
            valueStructInspectors = new StructObjectInspector[maxTags];
            valueStringWriters = new List[maxTags];
            keysColumnOffset = keyStructInspector.getAllStructFieldRefs().size();
            buffer = new DataOutputBuffer();
        }
        for (int tag = 0; tag < gWork.getTagToValueDesc().size(); tag++) {
            // We should initialize the SerDe with the TypeInfo when available.
            valueTableDesc[tag] = gWork.getTagToValueDesc().get(tag);
            inputValueDeserializer[tag] = ReflectionUtils.newInstance(valueTableDesc[tag].getDeserializerClass(), null);
            SerDeUtils.initializeSerDe(inputValueDeserializer[tag], null, valueTableDesc[tag].getProperties(), null);
            valueObjectInspector[tag] = inputValueDeserializer[tag].getObjectInspector();
            ArrayList<ObjectInspector> ois = new ArrayList<ObjectInspector>();
            if (vectorized) {
                /* vectorization only works with struct object inspectors */
                valueStructInspectors[tag] = (StructObjectInspector) valueObjectInspector[tag];
                final int totalColumns = keysColumnOffset + valueStructInspectors[tag].getAllStructFieldRefs().size();
                valueStringWriters[tag] = new ArrayList<VectorExpressionWriter>(totalColumns);
                valueStringWriters[tag].addAll(Arrays.asList(VectorExpressionWriterFactory.genVectorStructExpressionWritables(keyStructInspector)));
                valueStringWriters[tag].addAll(Arrays.asList(VectorExpressionWriterFactory.genVectorStructExpressionWritables(valueStructInspectors[tag])));
                rowObjectInspector[tag] = Utilities.constructVectorizedReduceRowOI(keyStructInspector, valueStructInspectors[tag]);
                batches[tag] = gWork.getVectorizedRowBatchCtx().createVectorizedRowBatch();
            } else {
                ois.add(keyObjectInspector);
                ois.add(valueObjectInspector[tag]);
                //reducer.setGroupKeyObjectInspector(keyObjectInspector);
                rowObjectInspector[tag] = ObjectInspectorFactory.getStandardStructObjectInspector(Utilities.reduceFieldNameList, ois);
            }
        }
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
    ExecMapperContext execContext = new ExecMapperContext(job);
    localWork = gWork.getMapRedLocalWork();
    execContext.setJc(jc);
    execContext.setLocalWork(localWork);
    reducer.passExecContext(execContext);
    reducer.setReporter(rp);
    OperatorUtils.setChildrenCollector(Arrays.<Operator<? extends OperatorDesc>>asList(reducer), output);
    // initialize reduce operator tree
    try {
        LOG.info(reducer.dump(0));
        reducer.initialize(jc, rowObjectInspector);
        if (localWork != null) {
            for (Operator<? extends OperatorDesc> dummyOp : localWork.getDummyParentOp()) {
                dummyOp.setExecContext(execContext);
                dummyOp.initialize(jc, null);
            }
        }
    } catch (Throwable e) {
        abort = true;
        if (e instanceof OutOfMemoryError) {
            // Don't create a new object if we are already out of memory
            throw (OutOfMemoryError) e;
        } else {
            throw new RuntimeException("Reduce operator initialization failed", e);
        }
    }
    perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.SPARK_INIT_OPERATORS);
}
Also used : ExecMapperContext(org.apache.hadoop.hive.ql.exec.mr.ExecMapperContext) ObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector) StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector) ArrayList(java.util.ArrayList) ReduceWork(org.apache.hadoop.hive.ql.plan.ReduceWork) VectorExpressionWriter(org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpressionWriter) IOException(java.io.IOException) SerDeException(org.apache.hadoop.hive.serde2.SerDeException) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) DataOutputBuffer(org.apache.hadoop.io.DataOutputBuffer)
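
The per-tag loop above repeats one pattern: instantiate a deserializer from a TableDesc, initialize it, and fetch its ObjectInspector. Below is a hedged sketch of just that step; the helper name inspectorFor is invented, while the calls themselves (ReflectionUtils.newInstance, SerDeUtils.initializeSerDe, getObjectInspector) are the ones used in SparkReduceRecordHandler.init.

import org.apache.hadoop.hive.ql.plan.TableDesc;
import org.apache.hadoop.hive.serde2.Deserializer;
import org.apache.hadoop.hive.serde2.SerDeUtils;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.util.ReflectionUtils;

public class DeserializerSketch {
    public static ObjectInspector inspectorFor(TableDesc desc) throws Exception {
        // create the configured deserializer (SerDe) for this key/value descriptor
        Deserializer deserializer =
            ReflectionUtils.newInstance(desc.getDeserializerClass(), null);
        // null Configuration and null partition properties, as in the reduce-side handler
        SerDeUtils.initializeSerDe(deserializer, null, desc.getProperties(), null);
        return deserializer.getObjectInspector();
    }
}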

Example 34 with ReduceWork

Use of org.apache.hadoop.hive.ql.plan.ReduceWork in project hive by apache.

The class SparkPlanGenerator, method generateParentTran.

// Generate (possibly get from a cached result) parent SparkTran
private SparkTran generateParentTran(SparkPlan sparkPlan, SparkWork sparkWork, BaseWork work) throws Exception {
    if (cloneToWork.containsKey(work)) {
        BaseWork originalWork = cloneToWork.get(work);
        if (workToParentWorkTranMap.containsKey(originalWork)) {
            return workToParentWorkTranMap.get(originalWork);
        }
    }
    SparkTran result;
    if (work instanceof MapWork) {
        result = generateMapInput(sparkPlan, (MapWork) work);
        sparkPlan.addTran(result);
    } else if (work instanceof ReduceWork) {
        List<BaseWork> parentWorks = sparkWork.getParents(work);
        result = generate(sparkPlan, sparkWork.getEdgeProperty(parentWorks.get(0), work), cloneToWork.containsKey(work));
        sparkPlan.addTran(result);
        for (BaseWork parentWork : parentWorks) {
            sparkPlan.connect(workToTranMap.get(parentWork), result);
        }
    } else {
        throw new IllegalStateException("AssertionError: expected either MapWork or ReduceWork, " + "but found " + work.getClass().getName());
    }
    if (cloneToWork.containsKey(work)) {
        workToParentWorkTranMap.put(cloneToWork.get(work), result);
    }
    return result;
}
Also used : MapWork(org.apache.hadoop.hive.ql.plan.MapWork) List(java.util.List) ReduceWork(org.apache.hadoop.hive.ql.plan.ReduceWork) BaseWork(org.apache.hadoop.hive.ql.plan.BaseWork)
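
The else branch above is the usual defensive dispatch on the concrete BaseWork subtype. Here is a hedged sketch of that dispatch in isolation; the class and method names are invented, the types are Hive's own.

import org.apache.hadoop.hive.ql.plan.BaseWork;
import org.apache.hadoop.hive.ql.plan.MapWork;
import org.apache.hadoop.hive.ql.plan.ReduceWork;

public class WorkDispatchSketch {
    public static String describeWork(BaseWork work) {
        if (work instanceof MapWork) {
            return "map input: " + work.getName();
        } else if (work instanceof ReduceWork) {
            return "shuffle/reduce stage: " + work.getName();
        }
        // mirror the generator's behaviour: anything else indicates a planner bug
        throw new IllegalStateException("expected MapWork or ReduceWork, found "
            + work.getClass().getName());
    }
}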

Aggregations

ReduceWork (org.apache.hadoop.hive.ql.plan.ReduceWork): 34
ArrayList (java.util.ArrayList): 12
MapWork (org.apache.hadoop.hive.ql.plan.MapWork): 12
Path (org.apache.hadoop.fs.Path): 11
Operator (org.apache.hadoop.hive.ql.exec.Operator): 10
BaseWork (org.apache.hadoop.hive.ql.plan.BaseWork): 10
ReduceSinkDesc (org.apache.hadoop.hive.ql.plan.ReduceSinkDesc): 9
MapJoinOperator (org.apache.hadoop.hive.ql.exec.MapJoinOperator): 8
ReduceSinkOperator (org.apache.hadoop.hive.ql.exec.ReduceSinkOperator): 8
JoinOperator (org.apache.hadoop.hive.ql.exec.JoinOperator): 7
MapredWork (org.apache.hadoop.hive.ql.plan.MapredWork): 7
TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator): 6
FileSinkDesc (org.apache.hadoop.hive.ql.plan.FileSinkDesc): 6
OperatorDesc (org.apache.hadoop.hive.ql.plan.OperatorDesc): 6
SelectDesc (org.apache.hadoop.hive.ql.plan.SelectDesc): 6
SMBMapJoinOperator (org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator): 5
MapRedTask (org.apache.hadoop.hive.ql.exec.mr.MapRedTask): 5
JobConf (org.apache.hadoop.mapred.JobConf): 5
IOException (java.io.IOException): 4
CommonMergeJoinOperator (org.apache.hadoop.hive.ql.exec.CommonMergeJoinOperator): 4