Example 11 with MapredLocalWork

Use of org.apache.hadoop.hive.ql.plan.MapredLocalWork in project hive by apache.

From class ExecDriver, method execute.

/**
   * Execute a query plan using Hadoop.
   */
@SuppressWarnings({ "deprecation", "unchecked" })
@Override
public int execute(DriverContext driverContext) {
    IOPrepareCache ioPrepareCache = IOPrepareCache.get();
    ioPrepareCache.clear();
    boolean success = true;
    Context ctx = driverContext.getCtx();
    boolean ctxCreated = false;
    Path emptyScratchDir;
    JobClient jc = null;
    if (driverContext.isShutdown()) {
        LOG.warn("Task was cancelled");
        return 5;
    }
    MapWork mWork = work.getMapWork();
    ReduceWork rWork = work.getReduceWork();
    try {
        if (ctx == null) {
            ctx = new Context(job);
            ctxCreated = true;
        }
        emptyScratchDir = ctx.getMRTmpPath();
        FileSystem fs = emptyScratchDir.getFileSystem(job);
        fs.mkdirs(emptyScratchDir);
    } catch (IOException e) {
        e.printStackTrace();
        console.printError("Error launching map-reduce job", "\n" + org.apache.hadoop.util.StringUtils.stringifyException(e));
        return 5;
    }
    HiveFileFormatUtils.prepareJobOutput(job);
    //See the javadoc on HiveOutputFormatImpl and HadoopShims.prepareJobOutput()
    job.setOutputFormat(HiveOutputFormatImpl.class);
    job.setMapperClass(ExecMapper.class);
    job.setMapOutputKeyClass(HiveKey.class);
    job.setMapOutputValueClass(BytesWritable.class);
    try {
        String partitioner = HiveConf.getVar(job, ConfVars.HIVEPARTITIONER);
        job.setPartitionerClass(JavaUtils.loadClass(partitioner));
    } catch (ClassNotFoundException e) {
        throw new RuntimeException(e.getMessage(), e);
    }
    propagateSplitSettings(job, mWork);
    job.setNumReduceTasks(rWork != null ? rWork.getNumReduceTasks().intValue() : 0);
    job.setReducerClass(ExecReducer.class);
    // set input format information if necessary
    setInputAttributes(job);
    // Turn on speculative execution for reducers
    boolean useSpeculativeExecReducers = HiveConf.getBoolVar(job, HiveConf.ConfVars.HIVESPECULATIVEEXECREDUCERS);
    job.setBoolean(MRJobConfig.REDUCE_SPECULATIVE, useSpeculativeExecReducers);
    String inpFormat = HiveConf.getVar(job, HiveConf.ConfVars.HIVEINPUTFORMAT);
    if (mWork.isUseBucketizedHiveInputFormat()) {
        inpFormat = BucketizedHiveInputFormat.class.getName();
    }
    LOG.info("Using " + inpFormat);
    try {
        job.setInputFormat(JavaUtils.loadClass(inpFormat));
    } catch (ClassNotFoundException e) {
        throw new RuntimeException(e.getMessage(), e);
    }
    // No-Op - we don't really write anything here ..
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    int returnVal = 0;
    boolean noName = StringUtils.isEmpty(job.get(MRJobConfig.JOB_NAME));
    if (noName) {
        // This is for a special case to ensure unit tests pass
        job.set(MRJobConfig.JOB_NAME, "JOB" + Utilities.randGen.nextInt());
    }
    try {
        MapredLocalWork localwork = mWork.getMapRedLocalWork();
        if (localwork != null && localwork.hasStagedAlias()) {
            if (!ShimLoader.getHadoopShims().isLocalMode(job)) {
                Path localPath = localwork.getTmpPath();
                Path hdfsPath = mWork.getTmpHDFSPath();
                FileSystem hdfs = hdfsPath.getFileSystem(job);
                FileSystem localFS = localPath.getFileSystem(job);
                FileStatus[] hashtableFiles = localFS.listStatus(localPath);
                int fileNumber = hashtableFiles.length;
                String[] fileNames = new String[fileNumber];
                for (int i = 0; i < fileNumber; i++) {
                    fileNames[i] = hashtableFiles[i].getPath().getName();
                }
                //package and compress all the hashtable files to an archive file
                String stageId = this.getId();
                String archiveFileName = Utilities.generateTarFileName(stageId);
                localwork.setStageID(stageId);
                CompressionUtils.tar(localPath.toUri().getPath(), fileNames, archiveFileName);
                Path archivePath = Utilities.generateTarPath(localPath, stageId);
                LOG.info("Archive " + hashtableFiles.length + " hash table files to " + archivePath);
                //upload archive file to hdfs
                Path hdfsFilePath = Utilities.generateTarPath(hdfsPath, stageId);
                short replication = (short) job.getInt("mapred.submit.replication", 10);
                hdfs.copyFromLocalFile(archivePath, hdfsFilePath);
                hdfs.setReplication(hdfsFilePath, replication);
                LOG.info("Upload 1 archive file  from" + archivePath + " to: " + hdfsFilePath);
                //add the archive file to distributed cache
                DistributedCache.createSymlink(job);
                DistributedCache.addCacheArchive(hdfsFilePath.toUri(), job);
                LOG.info("Add 1 archive file to distributed cache. Archive file: " + hdfsFilePath.toUri());
            }
        }
        work.configureJobConf(job);
        List<Path> inputPaths = Utilities.getInputPaths(job, mWork, emptyScratchDir, ctx, false);
        Utilities.setInputPaths(job, inputPaths);
        Utilities.setMapRedWork(job, work, ctx.getMRTmpPath());
        if (mWork.getSamplingType() > 0 && rWork != null && job.getNumReduceTasks() > 1) {
            try {
                handleSampling(ctx, mWork, job);
                job.setPartitionerClass(HiveTotalOrderPartitioner.class);
            } catch (IllegalStateException e) {
                console.printInfo("Not enough sampling data.. Rolling back to single reducer task");
                rWork.setNumReduceTasks(1);
                job.setNumReduceTasks(1);
            } catch (Exception e) {
                LOG.error("Sampling error", e);
                console.printError(e.toString(), "\n" + org.apache.hadoop.util.StringUtils.stringifyException(e));
                rWork.setNumReduceTasks(1);
                job.setNumReduceTasks(1);
            }
        }
        jc = new JobClient(job);
        // make this client wait if job tracker is not behaving well.
        Throttle.checkJobTracker(job, LOG);
        if (mWork.isGatheringStats() || (rWork != null && rWork.isGatheringStats())) {
            // initialize stats publishing table
            StatsPublisher statsPublisher;
            StatsFactory factory = StatsFactory.newFactory(job);
            if (factory != null) {
                statsPublisher = factory.getStatsPublisher();
                List<String> statsTmpDir = Utilities.getStatsTmpDirs(mWork, job);
                if (rWork != null) {
                    statsTmpDir.addAll(Utilities.getStatsTmpDirs(rWork, job));
                }
                StatsCollectionContext sc = new StatsCollectionContext(job);
                sc.setStatsTmpDirs(statsTmpDir);
                if (!statsPublisher.init(sc)) {
                    // creating stats table if not exists
                    if (HiveConf.getBoolVar(job, HiveConf.ConfVars.HIVE_STATS_RELIABLE)) {
                        throw new HiveException(ErrorMsg.STATSPUBLISHER_INITIALIZATION_ERROR.getErrorCodedMsg());
                    }
                }
            }
        }
        Utilities.createTmpDirs(job, mWork);
        Utilities.createTmpDirs(job, rWork);
        SessionState ss = SessionState.get();
        if (HiveConf.getVar(job, HiveConf.ConfVars.HIVE_EXECUTION_ENGINE).equals("tez") && ss != null) {
            TezSessionState session = ss.getTezSession();
            TezSessionPoolManager.getInstance().closeIfNotDefault(session, true);
        }
        HiveConfUtil.updateJobCredentialProviders(job);
        // Finally SUBMIT the JOB!
        if (driverContext.isShutdown()) {
            LOG.warn("Task was cancelled");
            return 5;
        }
        rj = jc.submitJob(job);
        if (driverContext.isShutdown()) {
            LOG.warn("Task was cancelled");
            if (rj != null) {
                rj.killJob();
                rj = null;
            }
            return 5;
        }
        this.jobID = rj.getJobID();
        updateStatusInQueryDisplay();
        returnVal = jobExecHelper.progress(rj, jc, ctx);
        success = (returnVal == 0);
    } catch (Exception e) {
        e.printStackTrace();
        setException(e);
        String mesg = " with exception '" + Utilities.getNameMessage(e) + "'";
        if (rj != null) {
            mesg = "Ended Job = " + rj.getJobID() + mesg;
        } else {
            mesg = "Job Submission failed" + mesg;
        }
        // Has to use full name to make sure it does not conflict with
        // org.apache.commons.lang.StringUtils
        console.printError(mesg, "\n" + org.apache.hadoop.util.StringUtils.stringifyException(e));
        success = false;
        returnVal = 1;
    } finally {
        Utilities.clearWork(job);
        try {
            if (ctxCreated) {
                ctx.clear();
            }
            if (rj != null) {
                if (returnVal != 0) {
                    rj.killJob();
                }
                jobID = rj.getID().toString();
            }
            if (jc != null) {
                jc.close();
            }
        } catch (Exception e) {
            LOG.warn("Failed while cleaning up ", e);
        } finally {
            HadoopJobExecHelper.runningJobs.remove(rj);
        }
    }
    // get the list of Dynamic partition paths
    try {
        if (rj != null) {
            if (mWork.getAliasToWork() != null) {
                for (Operator<? extends OperatorDesc> op : mWork.getAliasToWork().values()) {
                    op.jobClose(job, success);
                }
            }
            if (rWork != null) {
                rWork.getReducer().jobClose(job, success);
            }
        }
    } catch (Exception e) {
        // jobClose needs to execute successfully otherwise fail task
        if (success) {
            setException(e);
            success = false;
            returnVal = 3;
            String mesg = "Job Commit failed with exception '" + Utilities.getNameMessage(e) + "'";
            console.printError(mesg, "\n" + org.apache.hadoop.util.StringUtils.stringifyException(e));
        }
    }
    return (returnVal);
}
Also used: SessionState (org.apache.hadoop.hive.ql.session.SessionState), TezSessionState (org.apache.hadoop.hive.ql.exec.tez.TezSessionState), IOPrepareCache (org.apache.hadoop.hive.ql.io.IOPrepareCache), FileStatus (org.apache.hadoop.fs.FileStatus), HiveException (org.apache.hadoop.hive.ql.metadata.HiveException), JobClient (org.apache.hadoop.mapred.JobClient), StatsPublisher (org.apache.hadoop.hive.ql.stats.StatsPublisher), StatsFactory (org.apache.hadoop.hive.ql.stats.StatsFactory), FileSystem (org.apache.hadoop.fs.FileSystem), StatsCollectionContext (org.apache.hadoop.hive.ql.stats.StatsCollectionContext), Context (org.apache.hadoop.hive.ql.Context), CompilationOpContext (org.apache.hadoop.hive.ql.CompilationOpContext), DriverContext (org.apache.hadoop.hive.ql.DriverContext), Path (org.apache.hadoop.fs.Path), BucketizedHiveInputFormat (org.apache.hadoop.hive.ql.io.BucketizedHiveInputFormat), ReduceWork (org.apache.hadoop.hive.ql.plan.ReduceWork), IOException (java.io.IOException), LogInitializationException (org.apache.hadoop.hive.common.LogUtils.LogInitializationException), MapWork (org.apache.hadoop.hive.ql.plan.MapWork), MapredLocalWork (org.apache.hadoop.hive.ql.plan.MapredLocalWork)
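
The staged-alias branch above follows a standard Hadoop distribution pattern: archive the local hash table files, copy the archive to HDFS at a high replication factor, and register it as a cache archive so every map task can unpack it locally. Below is a minimal sketch of the upload-and-register half, assuming the tar file has already been built (Hive builds it with its internal CompressionUtils.tar); the class and method names here are illustrative, not Hive APIs.

import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;

public class CacheArchiveSketch {
    // Upload a locally built archive to HDFS and register it with the
    // distributed cache, mirroring the staged-alias branch above. The
    // caller is assumed to have tarred the hash table files already.
    public static void stageArchive(JobConf job, Path localArchive, Path hdfsArchive)
            throws Exception {
        FileSystem hdfs = hdfsArchive.getFileSystem(job);
        hdfs.copyFromLocalFile(localArchive, hdfsArchive);
        // Higher replication lets many map tasks fetch the file in parallel.
        short replication = (short) job.getInt("mapred.submit.replication", 10);
        hdfs.setReplication(hdfsArchive, replication);
        // A symlink lets each task refer to the archive by name in its working dir.
        DistributedCache.createSymlink(job);
        DistributedCache.addCacheArchive(hdfsArchive.toUri(), job);
    }
}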

Example 12 with MapredLocalWork

Use of org.apache.hadoop.hive.ql.plan.MapredLocalWork in project hive by apache.

From class MapredLocalTask, method executeInChildVM.

public int executeInChildVM(DriverContext driverContext) {
    // execute in child jvm
    try {
        // generate the cmd line to run in the child jvm
        Context ctx = driverContext.getCtx();
        String hiveJar = conf.getJar();
        String hadoopExec = conf.getVar(HiveConf.ConfVars.HADOOPBIN);
        conf.setVar(ConfVars.HIVEADDEDJARS, Utilities.getResourceFiles(conf, SessionState.ResourceType.JAR));
        // write out the plan to a local file
        Path planPath = new Path(ctx.getLocalTmpPath(), "plan.xml");
        MapredLocalWork plan = getWork();
        LOG.info("Generating plan file " + planPath.toString());
        OutputStream out = null;
        try {
            out = FileSystem.getLocal(conf).create(planPath);
            SerializationUtilities.serializePlan(plan, out);
            out.close();
            out = null;
        } finally {
            IOUtils.closeQuietly(out);
        }
        String isSilent = "true".equalsIgnoreCase(System.getProperty("test.silent")) ? "-nolog" : "";
        String jarCmd;
        jarCmd = hiveJar + " " + ExecDriver.class.getName();
        String hiveConfArgs = ExecDriver.generateCmdLine(conf, ctx);
        String cmdLine = hadoopExec + " jar " + jarCmd + " -localtask -plan " + planPath.toString() + " " + isSilent + " " + hiveConfArgs;
        String workDir = (new File(".")).getCanonicalPath();
        String files = Utilities.getResourceFiles(conf, SessionState.ResourceType.FILE);
        if (!files.isEmpty()) {
            cmdLine = cmdLine + " -files " + files;
            workDir = ctx.getLocalTmpPath().toUri().getPath();
            if (!(new File(workDir)).mkdir()) {
                throw new IOException("Cannot create tmp working dir: " + workDir);
            }
            for (String f : StringUtils.split(files, ',')) {
                Path p = new Path(f);
                String target = p.toUri().getPath();
                String link = workDir + Path.SEPARATOR + p.getName();
                if (FileUtil.symLink(target, link) != 0) {
                    throw new IOException("Cannot link to added file: " + target + " from: " + link);
                }
            }
        }
        // Inherit Java system variables
        String hadoopOpts;
        StringBuilder sb = new StringBuilder();
        Properties p = System.getProperties();
        for (String element : HIVE_SYS_PROP) {
            if (p.containsKey(element)) {
                sb.append(" -D" + element + "=" + p.getProperty(element));
            }
        }
        hadoopOpts = sb.toString();
        // Inherit the environment variables
        String[] env;
        Map<String, String> variables = new HashMap<String, String>(System.getenv());
        // The user can specify the hadoop memory
        // if ("local".equals(conf.getVar(HiveConf.ConfVars.HADOOPJT))) {
        // if we are running in local mode - then the amount of memory used
        // by the child jvm can no longer default to the memory used by the
        // parent jvm
        // int hadoopMem = conf.getIntVar(HiveConf.ConfVars.HIVEHADOOPMAXMEM);
        int hadoopMem = conf.getIntVar(HiveConf.ConfVars.HIVEHADOOPMAXMEM);
        if (hadoopMem == 0) {
            // remove env var that would default child jvm to use parent's memory
            // as default. child jvm would use default memory for a hadoop client
            variables.remove(HADOOP_MEM_KEY);
        } else {
            // user specified the memory for local mode hadoop run
            console.printInfo(" set heap size\t" + hadoopMem + "MB");
            variables.put(HADOOP_MEM_KEY, String.valueOf(hadoopMem));
        }
        // } else {
        // nothing to do - we are not running in local mode - only submitting
        // the job via a child process. in this case it's appropriate that the
        // child jvm use the same memory as the parent jvm
        // }
        //Set HADOOP_USER_NAME env variable for child process, so that
        // it also runs with hadoop permissions for the user the job is running as
        // This will be used by hadoop only in unsecure(/non kerberos) mode
        String endUserName = Utils.getUGI().getShortUserName();
        LOG.debug("setting HADOOP_USER_NAME\t" + endUserName);
        variables.put("HADOOP_USER_NAME", endUserName);
        if (variables.containsKey(HADOOP_OPTS_KEY)) {
            variables.put(HADOOP_OPTS_KEY, variables.get(HADOOP_OPTS_KEY) + hadoopOpts);
        } else {
            variables.put(HADOOP_OPTS_KEY, hadoopOpts);
        }
        // HiveServer2 passes "-hiveconf hive.hadoop.classpath=%HIVE_LIB%"; combine it with any existing HADOOP_CLASSPATH.
        if (HiveConf.getVar(conf, HiveConf.ConfVars.HIVE_HADOOP_CLASSPATH) != null) {
            if (variables.containsKey("HADOOP_CLASSPATH")) {
                variables.put("HADOOP_CLASSPATH", variables.get("HADOOP_CLASSPATH") + ";" + HiveConf.getVar(conf, HiveConf.ConfVars.HIVE_HADOOP_CLASSPATH));
            } else {
                variables.put("HADOOP_CLASSPATH", HiveConf.getVar(conf, HiveConf.ConfVars.HIVE_HADOOP_CLASSPATH));
            }
        }
        if (variables.containsKey(MapRedTask.HIVE_DEBUG_RECURSIVE)) {
            MapRedTask.configureDebugVariablesForChildJVM(variables);
        }
        if (UserGroupInformation.isSecurityEnabled() && UserGroupInformation.isLoginKeytabBased()) {
            //If kerberos security is enabled, and HS2 doAs is enabled,
            // then additional params need to be set so that the command is run as
            // intended user
            secureDoAs = new SecureCmdDoAs(conf);
            secureDoAs.addEnv(variables);
        }
        // If HIVE_LOCAL_TASK_CHILD_OPTS is set, the child VM's HADOOP_CLIENT_OPTS
        // is replaced with it (and HADOOP_OPTS is updated to match), so that the
        // local task JVM can have different settings from those of HiveServer2.
        if (variables.containsKey(HIVE_LOCAL_TASK_CHILD_OPTS_KEY)) {
            String childOpts = variables.get(HIVE_LOCAL_TASK_CHILD_OPTS_KEY);
            if (childOpts == null) {
                childOpts = "";
            }
            String clientOpts = variables.put(HADOOP_CLIENT_OPTS, childOpts);
            String tmp = variables.get(HADOOP_OPTS_KEY);
            if (tmp != null && !StringUtils.isBlank(clientOpts)) {
                tmp = tmp.replace(clientOpts, childOpts);
                variables.put(HADOOP_OPTS_KEY, tmp);
            }
        }
        env = new String[variables.size()];
        int pos = 0;
        for (Map.Entry<String, String> entry : variables.entrySet()) {
            String name = entry.getKey();
            String value = entry.getValue();
            env[pos++] = name + "=" + value;
            LOG.debug("Setting env: " + name + "=" + LogUtils.maskIfPassword(name, value));
        }
        LOG.info("Executing: " + cmdLine);
        // Run ExecDriver in another JVM
        executor = Runtime.getRuntime().exec(cmdLine, env, new File(workDir));
        CachingPrintStream errPrintStream = new CachingPrintStream(System.err);
        StreamPrinter outPrinter;
        StreamPrinter errPrinter;
        OperationLog operationLog = OperationLog.getCurrentOperationLog();
        if (operationLog != null) {
            outPrinter = new StreamPrinter(executor.getInputStream(), null, System.out, operationLog.getPrintStream());
            errPrinter = new StreamPrinter(executor.getErrorStream(), null, errPrintStream, operationLog.getPrintStream());
        } else {
            outPrinter = new StreamPrinter(executor.getInputStream(), null, System.out);
            errPrinter = new StreamPrinter(executor.getErrorStream(), null, errPrintStream);
        }
        outPrinter.start();
        errPrinter.start();
        int exitVal = jobExecHelper.progressLocal(executor, getId());
        // wait for stream threads to finish
        outPrinter.join();
        errPrinter.join();
        if (exitVal != 0) {
            LOG.error("Execution failed with exit status: " + exitVal);
            if (SessionState.get() != null) {
                SessionState.get().addLocalMapRedErrors(getId(), errPrintStream.getOutput());
            }
        } else {
            LOG.info("Execution completed successfully");
        }
        return exitVal;
    } catch (Exception e) {
        LOG.error("Exception: ", e);
        return (1);
    } finally {
        if (secureDoAs != null) {
            secureDoAs.close();
        }
    }
}
Also used: Context (org.apache.hadoop.hive.ql.Context), CompilationOpContext (org.apache.hadoop.hive.ql.CompilationOpContext), BucketMapJoinContext (org.apache.hadoop.hive.ql.plan.BucketMapJoinContext), DriverContext (org.apache.hadoop.hive.ql.DriverContext), Path (org.apache.hadoop.fs.Path), HashMap (java.util.HashMap), SecureCmdDoAs (org.apache.hadoop.hive.ql.exec.SecureCmdDoAs), OutputStream (java.io.OutputStream), OperationLog (org.apache.hadoop.hive.ql.session.OperationLog), IOException (java.io.IOException), Properties (java.util.Properties), HiveException (org.apache.hadoop.hive.ql.metadata.HiveException), MapJoinMemoryExhaustionException (org.apache.hadoop.hive.ql.exec.mapjoin.MapJoinMemoryExhaustionException), CachingPrintStream (org.apache.hadoop.hive.common.io.CachingPrintStream), StreamPrinter (org.apache.hive.common.util.StreamPrinter), MapredLocalWork (org.apache.hadoop.hive.ql.plan.MapredLocalWork), File (java.io.File), Map (java.util.Map)
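
executeInChildVM reduces to a recognizable shape: serialize the plan, assemble a hadoop jar command line, fold selected parent settings into the child environment, fork a JVM, and drain its output so it cannot block on a full pipe. Here is a minimal sketch of that forking pattern using only JDK APIs; the forwarded property list is illustrative, and stderr is merged into stdout for brevity where Hive instead drains the two streams on separate StreamPrinter threads.

import java.io.BufferedReader;
import java.io.File;
import java.io.InputStreamReader;
import java.util.Map;

public class ChildJvmSketch {
    // Fork a child JVM with an augmented environment and drain its output,
    // following the structure of executeInChildVM above.
    public static int run(String[] cmd, File workDir) throws Exception {
        ProcessBuilder pb = new ProcessBuilder(cmd);
        pb.directory(workDir);
        pb.redirectErrorStream(true);
        Map<String, String> env = pb.environment(); // starts as a copy of System.getenv()
        // Forward selected parent system properties as -D flags (illustrative list).
        StringBuilder opts = new StringBuilder(env.getOrDefault("HADOOP_OPTS", ""));
        for (String key : new String[] { "java.io.tmpdir", "user.timezone" }) {
            String val = System.getProperty(key);
            if (val != null) {
                opts.append(" -D").append(key).append('=').append(val);
            }
        }
        env.put("HADOOP_OPTS", opts.toString());
        Process proc = pb.start();
        // Keep reading so the child never blocks on a full output pipe.
        try (BufferedReader r = new BufferedReader(new InputStreamReader(proc.getInputStream()))) {
            String line;
            while ((line = r.readLine()) != null) {
                System.out.println(line);
            }
        }
        return proc.waitFor();
    }
}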

Example 13 with MapredLocalWork

Use of org.apache.hadoop.hive.ql.plan.MapredLocalWork in project hive by apache.

From class HashTableLoader, method loadDirectly.

private void loadDirectly(MapJoinTableContainer[] mapJoinTables, String inputFileName) throws Exception {
    MapredLocalWork localWork = context.getLocalWork();
    List<Operator<?>> directWorks = localWork.getDirectFetchOp().get(joinOp);
    if (directWorks == null || directWorks.isEmpty()) {
        return;
    }
    JobConf job = new JobConf(hconf);
    MapredLocalTask localTask = new MapredLocalTask(localWork, job, false);
    HashTableSinkOperator sink = new TemporaryHashSinkOperator(new CompilationOpContext(), desc);
    sink.setParentOperators(new ArrayList<Operator<? extends OperatorDesc>>(directWorks));
    for (Operator<?> operator : directWorks) {
        if (operator != null) {
            operator.setChildOperators(Arrays.<Operator<? extends OperatorDesc>>asList(sink));
        }
    }
    localTask.setExecContext(context);
    localTask.startForward(inputFileName);
    MapJoinTableContainer[] tables = sink.getMapJoinTables();
    for (int i = 0; i < sink.getNumParent(); i++) {
        if (sink.getParentOperators().get(i) != null) {
            mapJoinTables[i] = tables[i];
        }
    }
    Arrays.fill(tables, null);
}
Also used: MapJoinOperator (org.apache.hadoop.hive.ql.exec.MapJoinOperator), HashTableSinkOperator (org.apache.hadoop.hive.ql.exec.HashTableSinkOperator), Operator (org.apache.hadoop.hive.ql.exec.Operator), TemporaryHashSinkOperator (org.apache.hadoop.hive.ql.exec.TemporaryHashSinkOperator), MapredLocalTask (org.apache.hadoop.hive.ql.exec.mr.MapredLocalTask), CompilationOpContext (org.apache.hadoop.hive.ql.CompilationOpContext), MapredLocalWork (org.apache.hadoop.hive.ql.plan.MapredLocalWork), MapJoinTableContainer (org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer), JobConf (org.apache.hadoop.mapred.JobConf), OperatorDesc (org.apache.hadoop.hive.ql.plan.OperatorDesc)
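
loadDirectly splices a temporary hash-table sink beneath the direct-fetch operators by rewriting the parent and child lists on both sides of the edge. A toy sketch of that re-wiring pattern, with a made-up Op class standing in for Hive's Operator:

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class OperatorWiringSketch {
    // Toy operator with mutable parent/child lists; a stand-in for Hive's
    // Operator, just enough to show the re-wiring pattern.
    static class Op {
        final String name;
        List<Op> parents = new ArrayList<>();
        List<Op> children = new ArrayList<>();
        Op(String name) { this.name = name; }
    }

    // Attach a sink beneath every source, mirroring the setParentOperators /
    // setChildOperators calls in loadDirectly above.
    static void attachSink(List<Op> sources, Op sink) {
        sink.parents = new ArrayList<>(sources);
        for (Op source : sources) {
            source.children = new ArrayList<>(Arrays.asList(sink));
        }
    }

    public static void main(String[] args) {
        Op a = new Op("fetch-a");
        Op b = new Op("fetch-b");
        Op sink = new Op("hash-sink");
        attachSink(Arrays.asList(a, b), sink);
        System.out.println(sink.parents.size() + " parents wired"); // prints "2 parents wired"
    }
}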

Example 14 with MapredLocalWork

Use of org.apache.hadoop.hive.ql.plan.MapredLocalWork in project hive by apache.

From class HashTableLoader, method load.

@Override
public void load(MapJoinTableContainer[] mapJoinTables, MapJoinTableContainerSerDe[] mapJoinTableSerdes) throws HiveException {
    // Note: it's possible that a MJ operator is in a ReduceWork, in which case the
    // currentInputPath will be null. But, since currentInputPath is only interesting
    // for bucket join case, and for bucket join the MJ operator will always be in
    // a MapWork, this should be OK.
    String currentInputPath = context.getCurrentInputPath() == null ? null : context.getCurrentInputPath().toString();
    LOG.info("******* Load from HashTable for input file: " + currentInputPath);
    MapredLocalWork localWork = context.getLocalWork();
    try {
        if (localWork.getDirectFetchOp() != null) {
            loadDirectly(mapJoinTables, currentInputPath);
        }
        // All HashTables share the same base dir,
        // which is passed in as the tmp path
        Path baseDir = localWork.getTmpPath();
        if (baseDir == null) {
            return;
        }
        FileSystem fs = FileSystem.get(baseDir.toUri(), hconf);
        BucketMapJoinContext mapJoinCtx = localWork.getBucketMapjoinContext();
        boolean firstContainer = true;
        boolean useOptimizedContainer = !useFastContainer && HiveConf.getBoolVar(hconf, HiveConf.ConfVars.HIVEMAPJOINUSEOPTIMIZEDTABLE);
        for (int pos = 0; pos < mapJoinTables.length; pos++) {
            if (pos == desc.getPosBigTable() || mapJoinTables[pos] != null) {
                continue;
            }
            if (useOptimizedContainer) {
                MapJoinObjectSerDeContext keyCtx = mapJoinTableSerdes[pos].getKeyContext();
                ObjectInspector keyOI = keyCtx.getSerDe().getObjectInspector();
                if (!MapJoinBytesTableContainer.isSupportedKey(keyOI)) {
                    if (firstContainer) {
                        LOG.warn("Not using optimized table container." + "Only a subset of mapjoin keys is supported.");
                        useOptimizedContainer = false;
                        HiveConf.setBoolVar(hconf, HiveConf.ConfVars.HIVEMAPJOINUSEOPTIMIZEDTABLE, false);
                    } else {
                        throw new HiveException("Only a subset of mapjoin keys is supported.");
                    }
                }
            }
            firstContainer = false;
            String bigInputPath = currentInputPath;
            if (currentInputPath != null && mapJoinCtx != null) {
                if (!desc.isBucketMapJoin()) {
                    bigInputPath = null;
                } else {
                    Set<String> aliases = ((SparkBucketMapJoinContext) mapJoinCtx).getPosToAliasMap().get(pos);
                    String alias = aliases.iterator().next();
                    // Any one small table input path
                    String smallInputPath = mapJoinCtx.getAliasBucketFileNameMapping().get(alias).get(bigInputPath).get(0);
                    bigInputPath = mapJoinCtx.getMappingBigFile(alias, smallInputPath);
                }
            }
            String fileName = localWork.getBucketFileName(bigInputPath);
            Path path = Utilities.generatePath(baseDir, desc.getDumpFilePrefix(), (byte) pos, fileName);
            mapJoinTables[pos] = load(fs, path, mapJoinTableSerdes[pos]);
        }
    } catch (Exception e) {
        throw new HiveException(e);
    }
}
Also used: Path (org.apache.hadoop.fs.Path), ObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector), HiveException (org.apache.hadoop.hive.ql.metadata.HiveException), FileSystem (org.apache.hadoop.fs.FileSystem), SparkBucketMapJoinContext (org.apache.hadoop.hive.ql.plan.SparkBucketMapJoinContext), BucketMapJoinContext (org.apache.hadoop.hive.ql.plan.BucketMapJoinContext), MapJoinObjectSerDeContext (org.apache.hadoop.hive.ql.exec.persistence.MapJoinObjectSerDeContext), MapredLocalWork (org.apache.hadoop.hive.ql.plan.MapredLocalWork)
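
For each small-table position, the loop above resolves the dump file the local task wrote under the shared tmp path, keyed by the big-table bucket file when bucket map join is enabled. The real file naming comes from Hive's internal Utilities.generatePath; the helper below is a hypothetical stand-in that only illustrates the shape of that mapping.

import org.apache.hadoop.fs.Path;

public class DumpPathSketch {
    // Hypothetical stand-in for Hive's internal Utilities.generatePath: each
    // small-table position gets its own dump file under the shared tmp dir,
    // keyed by the big-table bucket file when bucket map join is on. Only the
    // shape of the mapping is meant to be accurate here, not the exact name.
    static Path dumpPath(Path baseDir, String prefix, byte pos, String bigBucketFile) {
        return new Path(baseDir, prefix + "-" + pos + "-" + bigBucketFile + ".hashtable");
    }

    public static void main(String[] args) {
        Path base = new Path("/tmp/hive/local-task");
        System.out.println(dumpPath(base, "MapJoin", (byte) 1, "bucket_00000"));
    }
}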

Example 15 with MapredLocalWork

Use of org.apache.hadoop.hive.ql.plan.MapredLocalWork in project hive by apache.

From class CommonJoinTaskDispatcher, method mergeMapJoinTaskIntoItsChildMapRedTask.

/*
   * A task and its child task has been converted from join to mapjoin.
   * See if the two tasks can be merged.
   */
private void mergeMapJoinTaskIntoItsChildMapRedTask(MapRedTask mapJoinTask, Configuration conf) throws SemanticException {
    // Step 1: Check whether mapJoinTask has a single child.
    // If so, check if we can merge mapJoinTask into that child.
    if (mapJoinTask.getChildTasks() == null || mapJoinTask.getChildTasks().size() > 1) {
        // No child task, or more than one child task, in which case we don't want to do anything.
        return;
    }
    Task<? extends Serializable> childTask = mapJoinTask.getChildTasks().get(0);
    if (!(childTask instanceof MapRedTask)) {
        // Nothing to do if it is not a MapReduce task.
        return;
    }
    MapRedTask childMapRedTask = (MapRedTask) childTask;
    MapWork mapJoinMapWork = mapJoinTask.getWork().getMapWork();
    MapWork childMapWork = childMapRedTask.getWork().getMapWork();
    Map<String, Operator<? extends OperatorDesc>> mapJoinAliasToWork = mapJoinMapWork.getAliasToWork();
    if (mapJoinAliasToWork.size() > 1) {
        // Do not merge if the MapredWork of MapJoin has multiple input aliases.
        return;
    }
    Entry<String, Operator<? extends OperatorDesc>> mapJoinAliasToWorkEntry = mapJoinAliasToWork.entrySet().iterator().next();
    String mapJoinAlias = mapJoinAliasToWorkEntry.getKey();
    TableScanOperator mapJoinTaskTableScanOperator = OperatorUtils.findSingleOperator(mapJoinAliasToWorkEntry.getValue(), TableScanOperator.class);
    if (mapJoinTaskTableScanOperator == null) {
        throw new SemanticException("Expected a " + TableScanOperator.getOperatorName() + " operator as the work associated with alias " + mapJoinAlias + ". Found a " + mapJoinAliasToWork.get(mapJoinAlias).getName() + " operator.");
    }
    FileSinkOperator mapJoinTaskFileSinkOperator = OperatorUtils.findSingleOperator(mapJoinTaskTableScanOperator, FileSinkOperator.class);
    if (mapJoinTaskFileSinkOperator == null) {
        throw new SemanticException("Cannot find the " + FileSinkOperator.getOperatorName() + " operator at the last operator of the MapJoin Task.");
    }
    // The mapJoinTaskFileSinkOperator writes to a different directory
    Path childMRPath = mapJoinTaskFileSinkOperator.getConf().getDirName();
    List<String> childMRAliases = childMapWork.getPathToAliases().get(childMRPath);
    if (childMRAliases == null || childMRAliases.size() != 1) {
        return;
    }
    String childMRAlias = childMRAliases.get(0);
    // Sanity check to make sure there is no alias conflict after merge.
    for (Entry<Path, ArrayList<String>> entry : childMapWork.getPathToAliases().entrySet()) {
        Path path = entry.getKey();
        List<String> aliases = entry.getValue();
        if (path.equals(childMRPath)) {
            continue;
        }
        if (aliases.contains(mapJoinAlias)) {
            // alias conflict should not happen here.
            return;
        }
    }
    MapredLocalWork mapJoinLocalWork = mapJoinMapWork.getMapRedLocalWork();
    MapredLocalWork childLocalWork = childMapWork.getMapRedLocalWork();
    if ((mapJoinLocalWork != null && mapJoinLocalWork.getBucketMapjoinContext() != null) || (childLocalWork != null && childLocalWork.getBucketMapjoinContext() != null)) {
        // Right now we do not handle the case where either task is bucketed.
        // We should relax this constraint with a follow-up jira.
        return;
    }
    // Check that the total size of the local tables after the merge is under the limit.
    if (!isLocalTableTotalSizeUnderLimitAfterMerge(conf, mapJoinLocalWork, childLocalWork)) {
        // Do not merge.
        return;
    }
    TableScanOperator childMRTaskTableScanOperator = OperatorUtils.findSingleOperator(childMapWork.getAliasToWork().get(childMRAlias.toString()), TableScanOperator.class);
    if (childMRTaskTableScanOperator == null) {
        throw new SemanticException("Expected a " + TableScanOperator.getOperatorName() + " operator as the work associated with alias " + childMRAlias + ". Found a " + childMapWork.getAliasToWork().get(childMRAlias).getName() + " operator.");
    }
    List<Operator<? extends OperatorDesc>> parentsInMapJoinTask = mapJoinTaskFileSinkOperator.getParentOperators();
    List<Operator<? extends OperatorDesc>> childrenInChildMRTask = childMRTaskTableScanOperator.getChildOperators();
    if (parentsInMapJoinTask.size() > 1 || childrenInChildMRTask.size() > 1) {
        // Do not merge if we do not know how to connect two operator trees.
        return;
    }
    // Step 2: Merge mapJoinTask into the Map-side of its child.
    // Step 2.1: Connect the operator trees of two MapRedTasks.
    Operator<? extends OperatorDesc> parentInMapJoinTask = parentsInMapJoinTask.get(0);
    Operator<? extends OperatorDesc> childInChildMRTask = childrenInChildMRTask.get(0);
    parentInMapJoinTask.replaceChild(mapJoinTaskFileSinkOperator, childInChildMRTask);
    childInChildMRTask.replaceParent(childMRTaskTableScanOperator, parentInMapJoinTask);
    // Step 2.2: Replace the corresponding part of childMRWork's MapWork.
    GenMapRedUtils.replaceMapWork(mapJoinAlias, childMRAlias.toString(), mapJoinMapWork, childMapWork);
    // Step 2.3: Fill up stuff in local work
    if (mapJoinLocalWork != null) {
        if (childLocalWork == null) {
            childMapWork.setMapRedLocalWork(mapJoinLocalWork);
        } else {
            childLocalWork.getAliasToFetchWork().putAll(mapJoinLocalWork.getAliasToFetchWork());
            childLocalWork.getAliasToWork().putAll(mapJoinLocalWork.getAliasToWork());
        }
    }
    // Step 2.4: Remove this MapJoin task
    List<Task<? extends Serializable>> parentTasks = mapJoinTask.getParentTasks();
    mapJoinTask.setParentTasks(null);
    mapJoinTask.setChildTasks(null);
    childMapRedTask.getParentTasks().remove(mapJoinTask);
    if (parentTasks != null) {
        childMapRedTask.getParentTasks().addAll(parentTasks);
        for (Task<? extends Serializable> parentTask : parentTasks) {
            parentTask.getChildTasks().remove(mapJoinTask);
            if (!parentTask.getChildTasks().contains(childMapRedTask)) {
                parentTask.getChildTasks().add(childMapRedTask);
            }
        }
    } else {
        if (physicalContext.getRootTasks().contains(mapJoinTask)) {
            physicalContext.removeFromRootTask(mapJoinTask);
            if (childMapRedTask.getParentTasks() != null && childMapRedTask.getParentTasks().size() == 0 && !physicalContext.getRootTasks().contains(childMapRedTask)) {
                physicalContext.addToRootTask(childMapRedTask);
            }
        }
    }
    if (childMapRedTask.getParentTasks().size() == 0) {
        childMapRedTask.setParentTasks(null);
    }
}
Also used: LateralViewForwardOperator (org.apache.hadoop.hive.ql.exec.LateralViewForwardOperator), FileSinkOperator (org.apache.hadoop.hive.ql.exec.FileSinkOperator), JoinOperator (org.apache.hadoop.hive.ql.exec.JoinOperator), TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator), Operator (org.apache.hadoop.hive.ql.exec.Operator), Path (org.apache.hadoop.fs.Path), ConditionalTask (org.apache.hadoop.hive.ql.exec.ConditionalTask), Task (org.apache.hadoop.hive.ql.exec.Task), MapRedTask (org.apache.hadoop.hive.ql.exec.mr.MapRedTask), Serializable (java.io.Serializable), ArrayList (java.util.ArrayList), MapWork (org.apache.hadoop.hive.ql.plan.MapWork), MapredLocalWork (org.apache.hadoop.hive.ql.plan.MapredLocalWork), OperatorDesc (org.apache.hadoop.hive.ql.plan.OperatorDesc), SemanticException (org.apache.hadoop.hive.ql.parse.SemanticException)
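
Step 2.4 above is a standard DAG splice: detach mapJoinTask and reconnect each of its former parents straight to childMapRedTask, guarding against duplicate edges. A self-contained sketch of the same splice on a toy TaskNode class (Hive's Task<?> keeps analogous parent/child lists):

import java.util.ArrayList;
import java.util.List;

public class TaskSpliceSketch {
    // Toy task node; a stand-in for Hive's Task<?> parent/child bookkeeping.
    static class TaskNode {
        final String name;
        List<TaskNode> parents = new ArrayList<>();
        List<TaskNode> children = new ArrayList<>();
        TaskNode(String name) { this.name = name; }
    }

    // Detach mid from the DAG and reconnect its parents straight to child,
    // guarding against duplicate edges, as Step 2.4 above does.
    static void splice(TaskNode mid, TaskNode child) {
        List<TaskNode> parents = mid.parents;
        mid.parents = new ArrayList<>();
        mid.children = new ArrayList<>();
        child.parents.remove(mid);
        for (TaskNode parent : parents) {
            parent.children.remove(mid);
            if (!parent.children.contains(child)) {
                parent.children.add(child);
            }
            if (!child.parents.contains(parent)) {
                child.parents.add(parent);
            }
        }
    }

    public static void main(String[] args) {
        TaskNode root = new TaskNode("root");
        TaskNode mapJoin = new TaskNode("mapjoin");
        TaskNode childMR = new TaskNode("child-mr");
        root.children.add(mapJoin);
        mapJoin.parents.add(root);
        mapJoin.children.add(childMR);
        childMR.parents.add(mapJoin);
        splice(mapJoin, childMR);
        System.out.println(root.children.get(0).name); // prints "child-mr"
    }
}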

Aggregations

MapredLocalWork (org.apache.hadoop.hive.ql.plan.MapredLocalWork): 15 usages
Path (org.apache.hadoop.fs.Path): 12 usages
HiveException (org.apache.hadoop.hive.ql.metadata.HiveException): 8 usages
Operator (org.apache.hadoop.hive.ql.exec.Operator): 6 usages
OperatorDesc (org.apache.hadoop.hive.ql.plan.OperatorDesc): 6 usages
ArrayList (java.util.ArrayList): 5 usages
LinkedHashMap (java.util.LinkedHashMap): 5 usages
Map (java.util.Map): 5 usages
MapJoinOperator (org.apache.hadoop.hive.ql.exec.MapJoinOperator): 5 usages
FetchWork (org.apache.hadoop.hive.ql.plan.FetchWork): 5 usages
PartitionDesc (org.apache.hadoop.hive.ql.plan.PartitionDesc): 5 usages
IOException (java.io.IOException): 4 usages
HashMap (java.util.HashMap): 4 usages
FileSystem (org.apache.hadoop.fs.FileSystem): 4 usages
CompilationOpContext (org.apache.hadoop.hive.ql.CompilationOpContext): 4 usages
SemanticException (org.apache.hadoop.hive.ql.parse.SemanticException): 4 usages
BucketMapJoinContext (org.apache.hadoop.hive.ql.plan.BucketMapJoinContext): 4 usages
DriverContext (org.apache.hadoop.hive.ql.DriverContext): 3 usages
FileSinkOperator (org.apache.hadoop.hive.ql.exec.FileSinkOperator): 3 usages
JoinOperator (org.apache.hadoop.hive.ql.exec.JoinOperator): 3 usages