
Example 31 with MapWork

Use of org.apache.hadoop.hive.ql.plan.MapWork in project hive by apache.

Class SparkMapRecordHandler, method init().

@Override
public <K, V> void init(JobConf job, OutputCollector<K, V> output, Reporter reporter) throws Exception {
    perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.SPARK_INIT_OPERATORS);
    super.init(job, output, reporter);
    try {
        jc = job;
        execContext = new ExecMapperContext(jc);
        // create map and fetch operators
        MapWork mrwork = Utilities.getMapWork(job);
        CompilationOpContext runtimeCtx = new CompilationOpContext();
        if (mrwork.getVectorMode()) {
            mo = new VectorMapOperator(runtimeCtx);
        } else {
            mo = new MapOperator(runtimeCtx);
        }
        mo.setConf(mrwork);
        // initialize map operator
        mo.initialize(jc, null);
        mo.setChildren(job);
        LOG.info(mo.dump(0));
        // initialize map local work
        localWork = mrwork.getMapRedLocalWork();
        execContext.setLocalWork(localWork);
        MapredContext.init(true, new JobConf(jc));
        MapredContext.get().setReporter(reporter);
        mo.passExecContext(execContext);
        mo.initializeLocalWork(jc);
        mo.initializeMapOperator(jc);
        mo.setReporter(rp);
        if (localWork == null) {
            return;
        }
        // The following code is for mapjoin
        // initialize all the dummy ops
        LOG.info("Initializing dummy operator");
        List<Operator<? extends OperatorDesc>> dummyOps = localWork.getDummyParentOp();
        for (Operator<? extends OperatorDesc> dummyOp : dummyOps) {
            dummyOp.setExecContext(execContext);
            dummyOp.initialize(jc, null);
        }
    } catch (Throwable e) {
        abort = true;
        if (e instanceof OutOfMemoryError) {
            // Don't create a new object if we are already out of memory
            throw (OutOfMemoryError) e;
        } else {
            throw new RuntimeException("Map operator initialization failed: " + e, e);
        }
    }
    perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.SPARK_INIT_OPERATORS);
}
Also used : Operator(org.apache.hadoop.hive.ql.exec.Operator) MapOperator(org.apache.hadoop.hive.ql.exec.MapOperator) AbstractMapOperator(org.apache.hadoop.hive.ql.exec.AbstractMapOperator) VectorMapOperator(org.apache.hadoop.hive.ql.exec.vector.VectorMapOperator) ExecMapperContext(org.apache.hadoop.hive.ql.exec.mr.ExecMapperContext) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) CompilationOpContext(org.apache.hadoop.hive.ql.CompilationOpContext) JobConf(org.apache.hadoop.mapred.JobConf) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc)
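
The core MapWork pattern in this handler is the vectorization switch: the plan fragment serialized into the JobConf is recovered with Utilities.getMapWork, and its vector-mode flag decides which map operator implementation drives execution. Below is a minimal sketch of just that switch, using only the calls shown above; the class and method names (MapOperatorFactory, chooseMapOperator) are illustrative, and it assumes both operator classes share AbstractMapOperator as suggested by the imports.

import org.apache.hadoop.hive.ql.CompilationOpContext;
import org.apache.hadoop.hive.ql.exec.AbstractMapOperator;
import org.apache.hadoop.hive.ql.exec.MapOperator;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.exec.vector.VectorMapOperator;
import org.apache.hadoop.hive.ql.plan.MapWork;
import org.apache.hadoop.mapred.JobConf;

// Illustrative helper (not part of Hive): recover the MapWork that the compiler
// serialized into the job configuration and pick the matching operator.
final class MapOperatorFactory {
    static AbstractMapOperator chooseMapOperator(JobConf job) {
        MapWork mapWork = Utilities.getMapWork(job);     // deserialized plan fragment
        CompilationOpContext ctx = new CompilationOpContext();
        AbstractMapOperator mo = mapWork.getVectorMode()
            ? new VectorMapOperator(ctx)                  // batch-at-a-time execution
            : new MapOperator(ctx);                       // row-at-a-time execution
        mo.setConf(mapWork);                              // attach the plan to the operator
        return mo;
    }
}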

Example 32 with MapWork

Use of org.apache.hadoop.hive.ql.plan.MapWork in project hive by apache.

Class SparkPlanGenerator, method generateMapInput().

@SuppressWarnings("unchecked")
private MapInput generateMapInput(SparkPlan sparkPlan, MapWork mapWork) throws Exception {
    JobConf jobConf = cloneJobConf(mapWork);
    Class ifClass = getInputFormat(jobConf, mapWork);
    sc.sc().setCallSite(CallSite.apply(mapWork.getName(), ""));
    JavaPairRDD<WritableComparable, Writable> hadoopRDD;
    if (mapWork.getNumMapTasks() != null) {
        jobConf.setNumMapTasks(mapWork.getNumMapTasks());
        hadoopRDD = sc.hadoopRDD(jobConf, ifClass, WritableComparable.class, Writable.class, mapWork.getNumMapTasks());
    } else {
        hadoopRDD = sc.hadoopRDD(jobConf, ifClass, WritableComparable.class, Writable.class);
    }
    boolean toCache = false;
    String tables = mapWork.getAllRootOperators().stream().filter(op -> op instanceof TableScanOperator).map(ts -> ((TableScanDesc) ts.getConf()).getAlias()).collect(Collectors.joining(", "));
    String rddName = mapWork.getName() + " (" + tables + ", " + hadoopRDD.getNumPartitions() + (toCache ? ", cached)" : ")");
    // Caching is disabled for MapInput due to HIVE-8920
    MapInput result = new MapInput(sparkPlan, hadoopRDD, toCache, rddName);
    return result;
}
Also used : StatsPublisher(org.apache.hadoop.hive.ql.stats.StatsPublisher) FileSystem(org.apache.hadoop.fs.FileSystem) CallSite(org.apache.spark.util.CallSite) LoggerFactory(org.slf4j.LoggerFactory) StatsCollectionContext(org.apache.hadoop.hive.ql.stats.StatsCollectionContext) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) WritableComparable(org.apache.hadoop.io.WritableComparable) HashMap(java.util.HashMap) Writable(org.apache.hadoop.io.Writable) TableScanDesc(org.apache.hadoop.hive.ql.plan.TableScanDesc) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) Utilities(org.apache.hadoop.hive.ql.exec.Utilities) ExecReducer(org.apache.hadoop.hive.ql.exec.mr.ExecReducer) ReduceWork(org.apache.hadoop.hive.ql.plan.ReduceWork) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) Context(org.apache.hadoop.hive.ql.Context) BaseWork(org.apache.hadoop.hive.ql.plan.BaseWork) PerfLogger(org.apache.hadoop.hive.ql.log.PerfLogger) BucketizedHiveInputFormat(org.apache.hadoop.hive.ql.io.BucketizedHiveInputFormat) Logger(org.slf4j.Logger) HiveConf(org.apache.hadoop.hive.conf.HiveConf) Set(java.util.Set) IOException(java.io.IOException) Collectors(java.util.stream.Collectors) SessionState(org.apache.hadoop.hive.ql.session.SessionState) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) JavaUtils(org.apache.hadoop.hive.common.JavaUtils) MergeFileWork(org.apache.hadoop.hive.ql.io.merge.MergeFileWork) Operator(org.apache.hadoop.hive.ql.exec.Operator) ExecMapper(org.apache.hadoop.hive.ql.exec.mr.ExecMapper) JobConf(org.apache.hadoop.mapred.JobConf) StatsFactory(org.apache.hadoop.hive.ql.stats.StatsFactory) List(java.util.List) SparkEdgeProperty(org.apache.hadoop.hive.ql.plan.SparkEdgeProperty) MergeFileOutputFormat(org.apache.hadoop.hive.ql.io.merge.MergeFileOutputFormat) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) SparkWork(org.apache.hadoop.hive.ql.plan.SparkWork) Preconditions(com.google.common.base.Preconditions) FileOutputFormat(org.apache.hadoop.mapred.FileOutputFormat) MergeFileMapper(org.apache.hadoop.hive.ql.io.merge.MergeFileMapper) ErrorMsg(org.apache.hadoop.hive.ql.ErrorMsg) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException)
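
The MapWork-specific pieces here are the parallelism hint (getNumMapTasks feeds Spark's minimum partition count) and the RDD label, which is built from the aliases of the TableScanOperators at the roots of the MapWork. A small sketch of just the naming logic, lifted from the stream expression above; the class and method names (RddNames, rddNameFor) are illustrative.

import java.util.stream.Collectors;
import org.apache.hadoop.hive.ql.exec.TableScanOperator;
import org.apache.hadoop.hive.ql.plan.MapWork;
import org.apache.hadoop.hive.ql.plan.TableScanDesc;

// Illustrative helper: build a readable RDD name from the table scan aliases
// at the roots of a MapWork, mirroring the logic in generateMapInput above.
final class RddNames {
    static String rddNameFor(MapWork mapWork, int numPartitions, boolean cached) {
        String tables = mapWork.getAllRootOperators().stream()
            .filter(op -> op instanceof TableScanOperator)
            .map(ts -> ((TableScanDesc) ts.getConf()).getAlias())
            .collect(Collectors.joining(", "));
        return mapWork.getName() + " (" + tables + ", " + numPartitions
            + (cached ? ", cached)" : ")");
    }
}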

Example 33 with MapWork

Use of org.apache.hadoop.hive.ql.plan.MapWork in project hive by apache.

Class SparkPlanGenerator, method cloneJobConf().

@SuppressWarnings({ "unchecked" })
private JobConf cloneJobConf(BaseWork work) throws Exception {
    if (workToJobConf.containsKey(work)) {
        return workToJobConf.get(work);
    }
    JobConf cloned = new JobConf(jobConf);
    // Make sure we'll use a different plan path from the original one
    HiveConf.setVar(cloned, HiveConf.ConfVars.PLAN, "");
    try {
        cloned.setPartitionerClass(JavaUtils.loadClass(HiveConf.getVar(cloned, HiveConf.ConfVars.HIVEPARTITIONER)));
    } catch (ClassNotFoundException e) {
        String msg = "Could not find partitioner class: " + e.getMessage() + " which is specified by: " + HiveConf.ConfVars.HIVEPARTITIONER.varname;
        throw new IllegalArgumentException(msg, e);
    }
    if (work instanceof MapWork) {
        MapWork mapWork = (MapWork) work;
        cloned.setBoolean("mapred.task.is.map", true);
        List<Path> inputPaths = Utilities.getInputPaths(cloned, mapWork, scratchDir, context, false);
        Utilities.setInputPaths(cloned, inputPaths);
        Utilities.setMapWork(cloned, mapWork, scratchDir, false);
        Utilities.createTmpDirs(cloned, mapWork);
        if (work instanceof MergeFileWork) {
            MergeFileWork mergeFileWork = (MergeFileWork) work;
            cloned.set(Utilities.MAPRED_MAPPER_CLASS, MergeFileMapper.class.getName());
            cloned.set("mapred.input.format.class", mergeFileWork.getInputformat());
            cloned.setClass("mapred.output.format.class", MergeFileOutputFormat.class, FileOutputFormat.class);
        } else {
            cloned.set(Utilities.MAPRED_MAPPER_CLASS, ExecMapper.class.getName());
        }
        if (mapWork.getMaxSplitSize() != null) {
            HiveConf.setLongVar(cloned, HiveConf.ConfVars.MAPREDMAXSPLITSIZE, mapWork.getMaxSplitSize());
        }
        if (mapWork.getMinSplitSize() != null) {
            HiveConf.setLongVar(cloned, HiveConf.ConfVars.MAPREDMINSPLITSIZE, mapWork.getMinSplitSize());
        }
        if (mapWork.getMinSplitSizePerNode() != null) {
            HiveConf.setLongVar(cloned, HiveConf.ConfVars.MAPREDMINSPLITSIZEPERNODE, mapWork.getMinSplitSizePerNode());
        }
        if (mapWork.getMinSplitSizePerRack() != null) {
            HiveConf.setLongVar(cloned, HiveConf.ConfVars.MAPREDMINSPLITSIZEPERRACK, mapWork.getMinSplitSizePerRack());
        }
        // remember the JobConf cloned for each MapWork, so we won't clone for it again
        workToJobConf.put(work, cloned);
    } else if (work instanceof ReduceWork) {
        cloned.setBoolean("mapred.task.is.map", false);
        Utilities.setReduceWork(cloned, (ReduceWork) work, scratchDir, false);
        Utilities.createTmpDirs(cloned, (ReduceWork) work);
        cloned.set(Utilities.MAPRED_REDUCER_CLASS, ExecReducer.class.getName());
    }
    return cloned;
}
Also used : Path(org.apache.hadoop.fs.Path) MergeFileWork(org.apache.hadoop.hive.ql.io.merge.MergeFileWork) MergeFileMapper(org.apache.hadoop.hive.ql.io.merge.MergeFileMapper) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) ReduceWork(org.apache.hadoop.hive.ql.plan.ReduceWork) JobConf(org.apache.hadoop.mapred.JobConf) ExecMapper(org.apache.hadoop.hive.ql.exec.mr.ExecMapper)
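
The per-MapWork split-size overrides all follow one pattern: each nullable hint carried by the MapWork is written into the cloned JobConf through HiveConf only when it is set. Below is that pattern isolated into a helper, using only the ConfVars and getters shown above; the class and method names (SplitSizes, applySplitSizes) are illustrative.

import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.plan.MapWork;
import org.apache.hadoop.mapred.JobConf;

// Illustrative helper: copy the optional split-size hints carried by a MapWork
// into a JobConf, exactly as cloneJobConf does above.
final class SplitSizes {
    static void applySplitSizes(JobConf conf, MapWork mapWork) {
        if (mapWork.getMaxSplitSize() != null) {
            HiveConf.setLongVar(conf, HiveConf.ConfVars.MAPREDMAXSPLITSIZE, mapWork.getMaxSplitSize());
        }
        if (mapWork.getMinSplitSize() != null) {
            HiveConf.setLongVar(conf, HiveConf.ConfVars.MAPREDMINSPLITSIZE, mapWork.getMinSplitSize());
        }
        if (mapWork.getMinSplitSizePerNode() != null) {
            HiveConf.setLongVar(conf, HiveConf.ConfVars.MAPREDMINSPLITSIZEPERNODE, mapWork.getMinSplitSizePerNode());
        }
        if (mapWork.getMinSplitSizePerRack() != null) {
            HiveConf.setLongVar(conf, HiveConf.ConfVars.MAPREDMINSPLITSIZEPERRACK, mapWork.getMinSplitSizePerRack());
        }
    }
}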

Example 34 with MapWork

Use of org.apache.hadoop.hive.ql.plan.MapWork in project hive by apache.

Class SparkPlanGenerator, method generate().

private SparkTran generate(BaseWork work, SparkWork sparkWork) throws Exception {
    initStatsPublisher(work);
    JobConf newJobConf = cloneJobConf(work);
    checkSpecs(work, newJobConf);
    byte[] confBytes = KryoSerializer.serializeJobConf(newJobConf);
    boolean caching = isCachingWork(work, sparkWork);
    if (work instanceof MapWork) {
        // Create tmp dir for MergeFileWork
        if (work instanceof MergeFileWork) {
            Path outputPath = ((MergeFileWork) work).getOutputDir();
            Path tempOutPath = Utilities.toTempPath(outputPath);
            FileSystem fs = outputPath.getFileSystem(jobConf);
            try {
                if (!fs.exists(tempOutPath)) {
                    fs.mkdirs(tempOutPath);
                }
            } catch (IOException e) {
                throw new RuntimeException("Can't make path " + outputPath + " : " + e.getMessage());
            }
        }
        MapTran mapTran = new MapTran(caching, work.getName());
        HiveMapFunction mapFunc = new HiveMapFunction(confBytes, sparkReporter);
        mapTran.setMapFunction(mapFunc);
        return mapTran;
    } else if (work instanceof ReduceWork) {
        ReduceTran reduceTran = new ReduceTran(caching, work.getName());
        HiveReduceFunction reduceFunc = new HiveReduceFunction(confBytes, sparkReporter);
        reduceTran.setReduceFunction(reduceFunc);
        return reduceTran;
    } else {
        throw new IllegalStateException("AssertionError: expected either MapWork or ReduceWork, " + "but found " + work.getClass().getName());
    }
}
Also used : Path(org.apache.hadoop.fs.Path) MergeFileWork(org.apache.hadoop.hive.ql.io.merge.MergeFileWork) IOException(java.io.IOException) ReduceWork(org.apache.hadoop.hive.ql.plan.ReduceWork) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) FileSystem(org.apache.hadoop.fs.FileSystem) JobConf(org.apache.hadoop.mapred.JobConf)
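
Within the MapWork branch, the one piece of filesystem work is preparing the temporary output directory for a MergeFileWork, since the merge mapper writes there directly. A sketch of that preparation step on its own, using only the calls shown above; the class and method names (MergeFilePrep, ensureTmpOutputDir) are illustrative, and the exception cause is chained here for clarity.

import java.io.IOException;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.io.merge.MergeFileWork;
import org.apache.hadoop.mapred.JobConf;

// Illustrative helper: create the temporary output directory that a
// MergeFileWork expects, mirroring the MergeFileWork branch of generate() above.
final class MergeFilePrep {
    static void ensureTmpOutputDir(MergeFileWork mergeFileWork, JobConf jobConf) {
        Path outputPath = mergeFileWork.getOutputDir();
        Path tempOutPath = Utilities.toTempPath(outputPath);
        try {
            FileSystem fs = outputPath.getFileSystem(jobConf);
            if (!fs.exists(tempOutPath)) {
                fs.mkdirs(tempOutPath);
            }
        } catch (IOException e) {
            throw new RuntimeException("Can't make path " + outputPath + " : " + e.getMessage(), e);
        }
    }
}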

Example 35 with MapWork

Use of org.apache.hadoop.hive.ql.plan.MapWork in project hive by apache.

Class DagUtils, method createVertex().

/**
 * Create a vertex from a given work object.
 *
 * @param conf JobConf to be used to this execution unit
 * @param work The instance of BaseWork representing the actual work to be performed
 * by this vertex.
 * @param scratchDir HDFS scratch dir for this execution unit.
 * @param fileSystem FS corresponding to scratchDir and LocalResources
 * @param ctx This query's context
 * @return Vertex
 */
@SuppressWarnings("deprecation")
public Vertex createVertex(JobConf conf, BaseWork work, Path scratchDir, FileSystem fileSystem, Context ctx, boolean hasChildren, TezWork tezWork, VertexType vertexType, Map<String, LocalResource> localResources) throws Exception {
    Vertex v = null;
    // dispatch on the concrete subtype of BaseWork
    if (work instanceof MapWork) {
        v = createVertex(conf, (MapWork) work, fileSystem, scratchDir, ctx, vertexType, localResources);
    } else if (work instanceof ReduceWork) {
        v = createVertex(conf, (ReduceWork) work, fileSystem, scratchDir, ctx, localResources);
    } else if (work instanceof MergeJoinWork) {
        v = createVertex(conf, (MergeJoinWork) work, fileSystem, scratchDir, ctx, vertexType, localResources);
        // set VertexManagerPlugin if this is a cross product destination vertex
        List<String> crossProductSources = new ArrayList<>();
        for (BaseWork parentWork : tezWork.getParents(work)) {
            if (tezWork.getEdgeType(parentWork, work) == EdgeType.XPROD_EDGE) {
                crossProductSources.add(parentWork.getName());
            }
        }
        if (!crossProductSources.isEmpty()) {
            CartesianProductConfig cpConfig = new CartesianProductConfig(crossProductSources);
            v.setVertexManagerPlugin(VertexManagerPluginDescriptor.create(CartesianProductVertexManager.class.getName()).setUserPayload(cpConfig.toUserPayload(new TezConfiguration(conf))));
        // parallelism shouldn't be set for cartesian product vertex
        }
    } else {
        // something is seriously wrong if this is happening
        throw new HiveException(ErrorMsg.GENERIC_ERROR.getErrorCodedMsg());
    }
    // initialize stats publisher if necessary
    if (work.isGatheringStats()) {
        StatsPublisher statsPublisher;
        StatsFactory factory = StatsFactory.newFactory(conf);
        if (factory != null) {
            StatsCollectionContext sCntxt = new StatsCollectionContext(conf);
            sCntxt.setStatsTmpDirs(Utilities.getStatsTmpDirs(work, conf));
            statsPublisher = factory.getStatsPublisher();
            if (!statsPublisher.init(sCntxt)) {
                // creating stats table if not exists
                if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_STATS_RELIABLE)) {
                    throw new HiveException(ErrorMsg.STATSPUBLISHER_INITIALIZATION_ERROR.getErrorCodedMsg());
                }
            }
        }
    }
    // final vertices need to have at least one output
    if (!hasChildren) {
        v.addDataSink("out_" + work.getName(), new DataSinkDescriptor(OutputDescriptor.create(MROutput.class.getName()).setUserPayload(TezUtils.createUserPayloadFromConf(conf)), null, null));
    }
    return v;
}
Also used : StatsCollectionContext(org.apache.hadoop.hive.ql.stats.StatsCollectionContext) Vertex(org.apache.tez.dag.api.Vertex) PreWarmVertex(org.apache.tez.dag.api.PreWarmVertex) MergeJoinWork(org.apache.hadoop.hive.ql.plan.MergeJoinWork) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) ArrayList(java.util.ArrayList) CartesianProductVertexManager(org.apache.tez.runtime.library.cartesianproduct.CartesianProductVertexManager) ReduceWork(org.apache.hadoop.hive.ql.plan.ReduceWork) DataSinkDescriptor(org.apache.tez.dag.api.DataSinkDescriptor) StatsPublisher(org.apache.hadoop.hive.ql.stats.StatsPublisher) StatsFactory(org.apache.hadoop.hive.ql.stats.StatsFactory) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) CartesianProductConfig(org.apache.tez.runtime.library.cartesianproduct.CartesianProductConfig) BaseWork(org.apache.hadoop.hive.ql.plan.BaseWork) TezConfiguration(org.apache.tez.dag.api.TezConfiguration)
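
The stats-publisher block at the end is independent of the vertex type: any work unit that gathers statistics gets a publisher initialized against its temporary stats directories, and a failed initialization is fatal only when hive.stats.reliable is set. Below is that block condensed into a standalone helper, using only the calls shown above; the class and method names (StatsInit, initStatsPublisherFor) are illustrative.

import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.ErrorMsg;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.plan.BaseWork;
import org.apache.hadoop.hive.ql.stats.StatsCollectionContext;
import org.apache.hadoop.hive.ql.stats.StatsFactory;
import org.apache.hadoop.hive.ql.stats.StatsPublisher;
import org.apache.hadoop.mapred.JobConf;

// Illustrative helper: initialize the stats publisher for a work unit that
// gathers statistics, mirroring the tail of createVertex() above.
final class StatsInit {
    static void initStatsPublisherFor(BaseWork work, JobConf conf) throws HiveException {
        if (!work.isGatheringStats()) {
            return;
        }
        StatsFactory factory = StatsFactory.newFactory(conf);
        if (factory == null) {
            return;
        }
        StatsCollectionContext sCtx = new StatsCollectionContext(conf);
        sCtx.setStatsTmpDirs(Utilities.getStatsTmpDirs(work, conf));
        StatsPublisher statsPublisher = factory.getStatsPublisher();
        // fail hard only when reliable statistics are required
        if (!statsPublisher.init(sCtx)
                && HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_STATS_RELIABLE)) {
            throw new HiveException(ErrorMsg.STATSPUBLISHER_INITIALIZATION_ERROR.getErrorCodedMsg());
        }
    }
}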

Aggregations

MapWork (org.apache.hadoop.hive.ql.plan.MapWork): 79
ArrayList (java.util.ArrayList): 25
Path (org.apache.hadoop.fs.Path): 24
TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator): 23
Operator (org.apache.hadoop.hive.ql.exec.Operator): 21
OperatorDesc (org.apache.hadoop.hive.ql.plan.OperatorDesc): 17
TableDesc (org.apache.hadoop.hive.ql.plan.TableDesc): 16
JobConf (org.apache.hadoop.mapred.JobConf): 15
Test (org.junit.Test): 15
BaseWork (org.apache.hadoop.hive.ql.plan.BaseWork): 14
ReduceWork (org.apache.hadoop.hive.ql.plan.ReduceWork): 14
MapredWork (org.apache.hadoop.hive.ql.plan.MapredWork): 13
Serializable (java.io.Serializable): 12
FileSinkOperator (org.apache.hadoop.hive.ql.exec.FileSinkOperator): 12
Task (org.apache.hadoop.hive.ql.exec.Task): 12
PartitionDesc (org.apache.hadoop.hive.ql.plan.PartitionDesc): 12
Context (org.apache.hadoop.hive.ql.Context): 11
LinkedHashMap (java.util.LinkedHashMap): 10
FileSystem (org.apache.hadoop.fs.FileSystem): 10
ConditionalTask (org.apache.hadoop.hive.ql.exec.ConditionalTask): 10