Search in sources :

Example 1 with MapredLocalWork

use of org.apache.hadoop.hive.ql.plan.MapredLocalWork in project hive by apache.

the class GenMRSkewJoinProcessor method processSkewJoin.

/**
 * Create tasks for processing skew joins. The idea is (HIVE-964) to use
 * separated jobs and map-joins to handle skew joins.
 * <p>
 * <ul>
 * <li>
 * Number of mr jobs to handle skew keys is the number of tables minus 1 (we
 * can stream the last table, so big keys in the last table will not be a
 * problem).
 * <li>
 * At runtime in Join, we output big keys in one table into one corresponding
 * directory, and all same keys in other tables into different dirs (one for
 * each table). The directories will look like:
 * <ul>
 * <li>
 * dir-T1-bigkeys(containing big keys in T1), dir-T2-keys(containing keys
 * which are big in T1), dir-T3-keys(containing keys which are big in T1), ...
 * <li>
 * dir-T1-keys(containing keys which are big in T2), dir-T2-bigkeys(containing
 * big keys in T2), dir-T3-keys(containing keys which are big in T2), ...
 * <li>
 * dir-T1-keys(containing keys which are big in T3), dir-T2-keys(containing
 * keys which are big in T3), dir-T3-bigkeys(containing big keys in T3), ...
 * </ul>
 * </ul>
 * For each table, we launch one mapjoin job, taking the directory containing
 * big keys in this table and corresponding dirs in other tables as input.
 * (Actually one job for one row in the above.)
 *
 * <p>
 * On return (when skew handling applies) the join descriptor has been flagged
 * for skew handling and given its big-key/small-key directory maps, and
 * {@code currTask} has a single child: a conditional task wrapping one map-join
 * task per alias except the last, followed by the previous children of
 * {@code currTask}.
 *
 * <p>
 * For more discussions, please check
 * https://issues.apache.org/jira/browse/HIVE-964.
 *
 * @param joinOp the join operator whose descriptor is updated and whose
 *        children are re-parented (on a cloned plan) to the generated map joins
 * @param currTask the task holding the join; its work is cast to
 *        {@code MapredWork} and its child tasks are replaced
 * @param parseCtx supplies the configuration and the MR temporary path
 * @throws SemanticException declared by the signature; not thrown directly by
 *         any statement in this body
 */
@SuppressWarnings("unchecked")
public static void processSkewJoin(JoinOperator joinOp, Task<? extends Serializable> currTask, ParseContext parseCtx) throws SemanticException {
    // Bail out when skew-join handling does not apply to this join.
    // now does not work with outer joins
    if (!GenMRSkewJoinProcessor.skewJoinEnabled(parseCtx.getConf(), joinOp)) {
        return;
    }
    // Remember the current children; they are re-attached behind the new tasks below.
    List<Task<? extends Serializable>> children = currTask.getChildTasks();
    Path baseTmpDir = parseCtx.getContext().getMRTmpPath();
    JoinDesc joinDescriptor = joinOp.getConf();
    Map<Byte, List<ExprNodeDesc>> joinValues = joinDescriptor.getExprs();
    int numAliases = joinValues.size();
    // Per alias: the directory for its big keys, and (per other alias) the
    // directory holding the matching rows of that other alias.
    Map<Byte, Path> bigKeysDirMap = new HashMap<Byte, Path>();
    Map<Byte, Map<Byte, Path>> smallKeysDirMap = new HashMap<Byte, Map<Byte, Path>>();
    // NOTE(review): this map is filled in the loop below but never read again in this method.
    Map<Byte, Path> skewJoinJobResultsDir = new HashMap<Byte, Path>();
    Byte[] tags = joinDescriptor.getTagOrder();
    for (int i = 0; i < numAliases; i++) {
        Byte alias = tags[i];
        bigKeysDirMap.put(alias, getBigKeysDir(baseTmpDir, alias));
        Map<Byte, Path> smallKeysMap = new HashMap<Byte, Path>();
        smallKeysDirMap.put(alias, smallKeysMap);
        for (Byte src2 : tags) {
            if (!src2.equals(alias)) {
                smallKeysMap.put(src2, getSmallKeysDir(baseTmpDir, alias, src2));
            }
        }
        skewJoinJobResultsDir.put(alias, getBigKeysSkewJoinResultDir(baseTmpDir, alias));
    }
    // Configure the original join descriptor with the skew-key directories and threshold.
    joinDescriptor.setHandleSkewJoin(true);
    joinDescriptor.setBigKeysDirMap(bigKeysDirMap);
    joinDescriptor.setSmallKeysDirMap(smallKeysDirMap);
    joinDescriptor.setSkewKeyDefinition(HiveConf.getIntVar(parseCtx.getConf(), HiveConf.ConfVars.HIVESKEWJOINKEY));
    // Big-key directory -> follow-up map-join task; handed to the conditional resolver context.
    HashMap<Path, Task<? extends Serializable>> bigKeysDirToTaskMap = new HashMap<Path, Task<? extends Serializable>>();
    List<Serializable> listWorks = new ArrayList<Serializable>();
    List<Task<? extends Serializable>> listTasks = new ArrayList<Task<? extends Serializable>>();
    MapredWork currPlan = (MapredWork) currTask.getWork();
    // Work on a clone of the reducer's key descriptor; join key names/types are read from its properties.
    TableDesc keyTblDesc = (TableDesc) currPlan.getReduceWork().getKeyDesc().clone();
    List<String> joinKeys = Utilities.getColumnNames(keyTblDesc.getProperties());
    List<String> joinKeyTypes = Utilities.getColumnTypes(keyTblDesc.getProperties());
    Map<Byte, TableDesc> tableDescList = new HashMap<Byte, TableDesc>();
    Map<Byte, RowSchema> rowSchemaList = new HashMap<Byte, RowSchema>();
    Map<Byte, List<ExprNodeDesc>> newJoinValues = new HashMap<Byte, List<ExprNodeDesc>>();
    Map<Byte, List<ExprNodeDesc>> newJoinKeys = new HashMap<Byte, List<ExprNodeDesc>>();
    // used for create mapJoinDesc, should be in order
    List<TableDesc> newJoinValueTblDesc = new ArrayList<TableDesc>();
    // Pre-size with nulls so that entries can be assigned by position via set(...) below.
    for (Byte tag : tags) {
        newJoinValueTblDesc.add(null);
    }
    // Build, per alias, the schema of the spilled table: value columns first, join keys last.
    for (int i = 0; i < numAliases; i++) {
        Byte alias = tags[i];
        List<ExprNodeDesc> valueCols = joinValues.get(alias);
        String colNames = "";
        String colTypes = "";
        int columnSize = valueCols.size();
        List<ExprNodeDesc> newValueExpr = new ArrayList<ExprNodeDesc>();
        List<ExprNodeDesc> newKeyExpr = new ArrayList<ExprNodeDesc>();
        ArrayList<ColumnInfo> columnInfos = new ArrayList<ColumnInfo>();
        // 'first' controls whether a comma separator is emitted before the next column.
        boolean first = true;
        for (int k = 0; k < columnSize; k++) {
            TypeInfo type = valueCols.get(k).getTypeInfo();
            // any name, it does not matter.
            String newColName = i + "_VALUE_" + k;
            ColumnInfo columnInfo = new ColumnInfo(newColName, type, alias.toString(), false);
            columnInfos.add(columnInfo);
            newValueExpr.add(new ExprNodeColumnDesc(columnInfo));
            if (!first) {
                colNames = colNames + ",";
                colTypes = colTypes + ",";
            }
            first = false;
            colNames = colNames + newColName;
            colTypes = colTypes + valueCols.get(k).getTypeString();
        }
        // we are putting join keys at last part of the spilled table
        for (int k = 0; k < joinKeys.size(); k++) {
            if (!first) {
                colNames = colNames + ",";
                colTypes = colTypes + ",";
            }
            first = false;
            colNames = colNames + joinKeys.get(k);
            colTypes = colTypes + joinKeyTypes.get(k);
            ColumnInfo columnInfo = new ColumnInfo(joinKeys.get(k), TypeInfoFactory.getPrimitiveTypeInfo(joinKeyTypes.get(k)), alias.toString(), false);
            columnInfos.add(columnInfo);
            newKeyExpr.add(new ExprNodeColumnDesc(columnInfo));
        }
        newJoinValues.put(alias, newValueExpr);
        newJoinKeys.put(alias, newKeyExpr);
        tableDescList.put(alias, Utilities.getTableDesc(colNames, colTypes));
        rowSchemaList.put(alias, new RowSchema(columnInfos));
        // construct value table Desc
        // (same generated value-column names as above, without the trailing join keys)
        String valueColNames = "";
        String valueColTypes = "";
        first = true;
        for (int k = 0; k < columnSize; k++) {
            // any name, it does not matter.
            String newColName = i + "_VALUE_" + k;
            if (!first) {
                valueColNames = valueColNames + ",";
                valueColTypes = valueColTypes + ",";
            }
            valueColNames = valueColNames + newColName;
            valueColTypes = valueColTypes + valueCols.get(k).getTypeString();
            first = false;
        }
        // The Byte argument is unboxed and used as the list index, i.e. the slot is the
        // loop position i, not the alias tag.
        newJoinValueTblDesc.set(Byte.valueOf((byte) i), Utilities.getTableDesc(valueColNames, valueColTypes));
    }
    joinDescriptor.setSkewKeysValuesTables(tableDescList);
    joinDescriptor.setKeyTableDesc(keyTblDesc);
    // One map-join task per alias except the last one in tag order.
    for (int i = 0; i < numAliases - 1; i++) {
        Byte src = tags[i];
        MapWork newPlan = PlanUtils.getMapRedWork().getMapWork();
        // This code has been only added for testing
        boolean mapperCannotSpanPartns = parseCtx.getConf().getBoolVar(HiveConf.ConfVars.HIVE_MAPPER_CANNOT_SPAN_MULTIPLE_PARTITIONS);
        newPlan.setMapperCannotSpanPartns(mapperCannotSpanPartns);
        // Clone the current plan so the reducer's children can be re-parented without touching currPlan.
        MapredWork clonePlan = SerializationUtilities.clonePlan(currPlan);
        Operator<? extends OperatorDesc>[] parentOps = new TableScanOperator[tags.length];
        for (int k = 0; k < tags.length; k++) {
            // NOTE(review): the maps are keyed by alias (tags[i]) but are looked up here with the
            // position k cast to byte; this assumes tag values equal their positions - confirm.
            Operator<? extends OperatorDesc> ts = GenMapRedUtils.createTemporaryTableScanOperator(joinOp.getCompilationOpContext(), rowSchemaList.get((byte) k));
            ((TableScanOperator) ts).setTableDescSkewJoin(tableDescList.get((byte) k));
            parentOps[k] = ts;
        }
        // The big-key side: its table scan reads the big-keys directory of alias 'src'.
        Operator<? extends OperatorDesc> tblScan_op = parentOps[i];
        ArrayList<String> aliases = new ArrayList<String>();
        String alias = src.toString().intern();
        aliases.add(alias);
        Path bigKeyDirPath = bigKeysDirMap.get(src);
        newPlan.addPathToAlias(bigKeyDirPath, aliases);
        newPlan.getAliasToWork().put(alias, tblScan_op);
        PartitionDesc part = new PartitionDesc(tableDescList.get(src), null);
        newPlan.addPathToPartitionInfo(bigKeyDirPath, part);
        newPlan.getAliasToPartnInfo().put(alias, part);
        Operator<? extends OperatorDesc> reducer = clonePlan.getReduceWork().getReducer();
        assert reducer instanceof JoinOperator;
        JoinOperator cloneJoinOp = (JoinOperator) reducer;
        String dumpFilePrefix = "mapfile" + PlanUtils.getCountForMapJoinDumpFilePrefix();
        // NOTE(review): newJoinValueTblDesc is passed for two consecutive constructor arguments;
        // the position i is passed as an int argument as well - check against MapJoinDesc's signature.
        MapJoinDesc mapJoinDescriptor = new MapJoinDesc(newJoinKeys, keyTblDesc, newJoinValues, newJoinValueTblDesc, newJoinValueTblDesc, joinDescriptor.getOutputColumnNames(), i, joinDescriptor.getConds(), joinDescriptor.getFilters(), joinDescriptor.getNoOuterJoin(), dumpFilePrefix, joinDescriptor.getMemoryMonitorInfo(), joinDescriptor.getInMemoryDataSize());
        mapJoinDescriptor.setTagOrder(tags);
        // The follow-up map join itself must not trigger skew handling again.
        mapJoinDescriptor.setHandleSkewJoin(false);
        mapJoinDescriptor.setNullSafes(joinDescriptor.getNullSafes());
        mapJoinDescriptor.setColumnExprMap(joinDescriptor.getColumnExprMap());
        // Local work: every other alias is fetched from its small-keys directory for 'src'.
        MapredLocalWork localPlan = new MapredLocalWork(new LinkedHashMap<String, Operator<? extends OperatorDesc>>(), new LinkedHashMap<String, FetchWork>());
        Map<Byte, Path> smallTblDirs = smallKeysDirMap.get(src);
        for (int j = 0; j < numAliases; j++) {
            if (j == i) {
                continue;
            }
            Byte small_alias = tags[j];
            Operator<? extends OperatorDesc> tblScan_op2 = parentOps[j];
            localPlan.getAliasToWork().put(small_alias.toString(), tblScan_op2);
            Path tblDir = smallTblDirs.get(small_alias);
            localPlan.getAliasToFetchWork().put(small_alias.toString(), new FetchWork(tblDir, tableDescList.get(small_alias)));
        }
        newPlan.setMapRedLocalWork(localPlan);
        // construct a map join and set it as the child operator of tblScan_op
        MapJoinOperator mapJoinOp = (MapJoinOperator) OperatorFactory.getAndMakeChild(joinOp.getCompilationOpContext(), mapJoinDescriptor, (RowSchema) null, parentOps);
        // change the children of the original join operator to point to the map
        // join operator
        List<Operator<? extends OperatorDesc>> childOps = cloneJoinOp.getChildOperators();
        for (Operator<? extends OperatorDesc> childOp : childOps) {
            childOp.replaceParent(cloneJoinOp, mapJoinOp);
        }
        mapJoinOp.setChildOperators(childOps);
        // Map-task count and minimum split size come from the skew-join specific settings.
        HiveConf jc = new HiveConf(parseCtx.getConf(), GenMRSkewJoinProcessor.class);
        newPlan.setNumMapTasks(HiveConf.getIntVar(jc, HiveConf.ConfVars.HIVESKEWJOINMAPJOINNUMMAPTASK));
        newPlan.setMinSplitSize(HiveConf.getLongVar(jc, HiveConf.ConfVars.HIVESKEWJOINMAPJOINMINSPLIT));
        newPlan.setInputformat(HiveInputFormat.class.getName());
        // Wrap the map work in a task and register it under its big-key directory.
        MapredWork w = new MapredWork();
        w.setMapWork(newPlan);
        Task<? extends Serializable> skewJoinMapJoinTask = TaskFactory.get(w);
        skewJoinMapJoinTask.setFetchSource(currTask.isFetchSource());
        bigKeysDirToTaskMap.put(bigKeyDirPath, skewJoinMapJoinTask);
        listWorks.add(skewJoinMapJoinTask.getWork());
        listTasks.add(skewJoinMapJoinTask);
    }
    if (children != null) {
        // Every generated task is followed by each of the original children ...
        for (Task<? extends Serializable> tsk : listTasks) {
            for (Task<? extends Serializable> oldChild : children) {
                tsk.addDependentTask(oldChild);
            }
        }
        // ... which are detached from currTask and appended to the conditional task's list.
        currTask.setChildTasks(new ArrayList<Task<? extends Serializable>>());
        for (Task<? extends Serializable> oldChild : children) {
            oldChild.getParentTasks().remove(currTask);
        }
        listTasks.addAll(children);
    }
    // Assemble the conditional task; the resolver receives the directory-to-task map and the old children.
    ConditionalResolverSkewJoinCtx context = new ConditionalResolverSkewJoinCtx(bigKeysDirToTaskMap, children);
    ConditionalWork cndWork = new ConditionalWork(listWorks);
    ConditionalTask cndTsk = (ConditionalTask) TaskFactory.get(cndWork);
    cndTsk.setListTasks(listTasks);
    cndTsk.setResolver(new ConditionalResolverSkewJoin());
    cndTsk.setResolverCtx(context);
    // Reset the child list (also covers the case where children was null) and make the
    // conditional task the only child of currTask.
    currTask.setChildTasks(new ArrayList<Task<? extends Serializable>>());
    currTask.addDependentTask(cndTsk);
    return;
}
Also used : MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) ConditionalTask(org.apache.hadoop.hive.ql.exec.ConditionalTask) Task(org.apache.hadoop.hive.ql.exec.Task) Serializable(java.io.Serializable) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) ArrayList(java.util.ArrayList) ConditionalWork(org.apache.hadoop.hive.ql.plan.ConditionalWork) ColumnInfo(org.apache.hadoop.hive.ql.exec.ColumnInfo) MapredWork(org.apache.hadoop.hive.ql.plan.MapredWork) ConditionalTask(org.apache.hadoop.hive.ql.exec.ConditionalTask) ArrayList(java.util.ArrayList) List(java.util.List) HiveConf(org.apache.hadoop.hive.conf.HiveConf) ExprNodeDesc(org.apache.hadoop.hive.ql.plan.ExprNodeDesc) ConditionalResolverSkewJoin(org.apache.hadoop.hive.ql.plan.ConditionalResolverSkewJoin) MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) RowSchema(org.apache.hadoop.hive.ql.exec.RowSchema) TypeInfo(org.apache.hadoop.hive.serde2.typeinfo.TypeInfo) MapredLocalWork(org.apache.hadoop.hive.ql.plan.MapredLocalWork) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc) MapJoinDesc(org.apache.hadoop.hive.ql.plan.MapJoinDesc) JoinDesc(org.apache.hadoop.hive.ql.plan.JoinDesc) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) Map(java.util.Map) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc) MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) HiveInputFormat(org.apache.hadoop.hive.ql.io.HiveInputFormat) ExprNodeColumnDesc(org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc) Path(org.apache.hadoop.fs.Path) MapJoinDesc(org.apache.hadoop.hive.ql.plan.MapJoinDesc) 
ConditionalResolverSkewJoinCtx(org.apache.hadoop.hive.ql.plan.ConditionalResolverSkewJoin.ConditionalResolverSkewJoinCtx) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) FetchWork(org.apache.hadoop.hive.ql.plan.FetchWork) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc)

Example 2 with MapredLocalWork

use of org.apache.hadoop.hive.ql.plan.MapredLocalWork in project hive by apache.

the class ExecDriver method execute.

/**
 * Execute a query plan using Hadoop.
 *
 * <p>Configures the job (mapper/reducer/partitioner/input format), ships
 * staged local hash-table files through the distributed cache when the map
 * work has staged aliases and the job is not in local mode, submits the job,
 * tracks its progress, and finally calls {@code jobClose} on the map and
 * reduce operators.
 *
 * @return {@code 0} on success; {@code 5} when the task queue was shut down or
 *         the scratch directory could not be created; {@code 1} when an
 *         exception occurred while preparing, submitting or running the job;
 *         {@code 3} when the job succeeded but {@code jobClose} failed;
 *         otherwise the non-zero value returned by the progress helper
 */
@SuppressWarnings({ "deprecation", "unchecked" })
@Override
public int execute() {
    IOPrepareCache ioPrepareCache = IOPrepareCache.get();
    ioPrepareCache.clear();
    boolean success = true;
    // Tracks whether the Context was created here, so only a locally created one is cleared.
    boolean ctxCreated = false;
    Path emptyScratchDir;
    JobClient jc = null;
    if (taskQueue.isShutdown()) {
        LOG.warn("Task was cancelled");
        return 5;
    }
    MapWork mWork = work.getMapWork();
    ReduceWork rWork = work.getReduceWork();
    Context ctx = context;
    try {
        if (ctx == null) {
            ctx = new Context(job);
            ctxCreated = true;
        }
        emptyScratchDir = ctx.getMRTmpPath();
        FileSystem fs = emptyScratchDir.getFileSystem(job);
        fs.mkdirs(emptyScratchDir);
    } catch (IOException e) {
        console.printError("Error launching map-reduce job", "\n" + org.apache.hadoop.util.StringUtils.stringifyException(e));
        return 5;
    }
    HiveFileFormatUtils.prepareJobOutput(job);
    // See the javadoc on HiveOutputFormatImpl and HadoopShims.prepareJobOutput()
    job.setOutputFormat(HiveOutputFormatImpl.class);
    job.setMapRunnerClass(ExecMapRunner.class);
    job.setMapperClass(ExecMapper.class);
    job.setMapOutputKeyClass(HiveKey.class);
    job.setMapOutputValueClass(BytesWritable.class);
    try {
        String partitioner = HiveConf.getVar(job, ConfVars.HIVEPARTITIONER);
        job.setPartitionerClass(JavaUtils.loadClass(partitioner));
    } catch (ClassNotFoundException e) {
        throw new RuntimeException(e.getMessage(), e);
    }
    propagateSplitSettings(job, mWork);
    // A map-only plan (no reduce work) runs with zero reducers.
    job.setNumReduceTasks(rWork != null ? rWork.getNumReduceTasks() : 0);
    job.setReducerClass(ExecReducer.class);
    // set input format information if necessary
    setInputAttributes(job);
    // HIVE-23354 enforces that MR speculative execution is disabled
    job.setBoolean(MRJobConfig.REDUCE_SPECULATIVE, false);
    job.setBoolean(MRJobConfig.MAP_SPECULATIVE, false);
    String inpFormat = HiveConf.getVar(job, HiveConf.ConfVars.HIVEINPUTFORMAT);
    // The map work can force the bucketized input format regardless of the configured one.
    if (mWork.isUseBucketizedHiveInputFormat()) {
        inpFormat = BucketizedHiveInputFormat.class.getName();
    }
    LOG.info("Using " + inpFormat);
    try {
        job.setInputFormat(JavaUtils.loadClass(inpFormat));
    } catch (ClassNotFoundException e) {
        throw new RuntimeException(e.getMessage(), e);
    }
    // No-Op - we don't really write anything here ..
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    int returnVal = 0;
    boolean noName = StringUtils.isEmpty(job.get(MRJobConfig.JOB_NAME));
    if (noName) {
        // This is for a special case to ensure unit tests pass
        job.set(MRJobConfig.JOB_NAME, "JOB" + ThreadLocalRandom.current().nextInt());
    }
    try {
        MapredLocalWork localwork = mWork.getMapRedLocalWork();
        if (localwork != null && localwork.hasStagedAlias()) {
            if (!ShimLoader.getHadoopShims().isLocalMode(job)) {
                Path localPath = localwork.getTmpPath();
                Path hdfsPath = mWork.getTmpHDFSPath();
                FileSystem hdfs = hdfsPath.getFileSystem(job);
                FileSystem localFS = localPath.getFileSystem(job);
                FileStatus[] hashtableFiles = localFS.listStatus(localPath);
                int fileNumber = hashtableFiles.length;
                String[] fileNames = new String[fileNumber];
                for (int i = 0; i < fileNumber; i++) {
                    fileNames[i] = hashtableFiles[i].getPath().getName();
                }
                // package and compress all the hashtable files to an archive file
                String stageId = this.getId();
                String archiveFileName = Utilities.generateTarFileName(stageId);
                localwork.setStageID(stageId);
                CompressionUtils.tar(localPath.toUri().getPath(), fileNames, archiveFileName);
                Path archivePath = Utilities.generateTarPath(localPath, stageId);
                LOG.info("Archive " + hashtableFiles.length + " hash table files to " + archivePath);
                // upload archive file to hdfs
                Path hdfsFilePath = Utilities.generateTarPath(hdfsPath, stageId);
                short replication = (short) job.getInt("mapred.submit.replication", 10);
                hdfs.copyFromLocalFile(archivePath, hdfsFilePath);
                hdfs.setReplication(hdfsFilePath, replication);
                // Fixed spacing: the message previously rendered as "...file  from<path> to: ...".
                LOG.info("Upload 1 archive file from " + archivePath + " to: " + hdfsFilePath);
                // add the archive file to distributed cache
                DistributedCache.createSymlink(job);
                DistributedCache.addCacheArchive(hdfsFilePath.toUri(), job);
                LOG.info("Add 1 archive file to distributed cache. Archive file: " + hdfsFilePath.toUri());
            }
        }
        work.configureJobConf(job);
        List<Path> inputPaths = Utilities.getInputPaths(job, mWork, emptyScratchDir, ctx, false);
        Utilities.setInputPaths(job, inputPaths);
        Utilities.setMapRedWork(job, work, ctx.getMRTmpPath());
        if (mWork.getSamplingType() > 0 && rWork != null && job.getNumReduceTasks() > 1) {
            // Sampling enables the total-order partitioner; on any failure fall back to one reducer.
            try {
                handleSampling(ctx, mWork, job);
                job.setPartitionerClass(HiveTotalOrderPartitioner.class);
            } catch (IllegalStateException e) {
                console.printInfo("Not enough sampling data.. Rolling back to single reducer task");
                rWork.setNumReduceTasks(1);
                job.setNumReduceTasks(1);
            } catch (Exception e) {
                LOG.error("Sampling error", e);
                console.printError(e.toString(), "\n" + org.apache.hadoop.util.StringUtils.stringifyException(e));
                rWork.setNumReduceTasks(1);
                job.setNumReduceTasks(1);
            }
        }
        jc = new JobClient(job);
        // make this client wait if job tracker is not behaving well.
        Throttle.checkJobTracker(job, LOG);
        if (mWork.isGatheringStats() || (rWork != null && rWork.isGatheringStats())) {
            // initialize stats publishing table
            StatsPublisher statsPublisher;
            StatsFactory factory = StatsFactory.newFactory(job);
            if (factory != null) {
                statsPublisher = factory.getStatsPublisher();
                List<String> statsTmpDir = Utilities.getStatsTmpDirs(mWork, job);
                if (rWork != null) {
                    statsTmpDir.addAll(Utilities.getStatsTmpDirs(rWork, job));
                }
                StatsCollectionContext sc = new StatsCollectionContext(job);
                sc.setStatsTmpDirs(statsTmpDir);
                if (!statsPublisher.init(sc)) {
                    // creating stats table if not exists
                    // A failed init is fatal only when reliable stats are required.
                    if (HiveConf.getBoolVar(job, HiveConf.ConfVars.HIVE_STATS_RELIABLE)) {
                        throw new HiveException(ErrorMsg.STATSPUBLISHER_INITIALIZATION_ERROR.getErrorCodedMsg());
                    }
                }
            }
        }
        Utilities.createTmpDirs(job, mWork);
        Utilities.createTmpDirs(job, rWork);
        SessionState ss = SessionState.get();
        // TODO: why is there a TezSession in MR ExecDriver?
        if (ss != null && HiveConf.getVar(job, ConfVars.HIVE_EXECUTION_ENGINE).equals("tez")) {
            // TODO: this is the only place that uses keepTmpDir. Why?
            TezSessionPoolManager.closeIfNotDefault(ss.getTezSession(), true);
        }
        HiveConfUtil.updateJobCredentialProviders(job);
        // Finally SUBMIT the JOB!
        if (taskQueue.isShutdown()) {
            LOG.warn("Task was cancelled");
            return 5;
        }
        rj = jc.submitJob(job);
        // Cancellation may have raced with submission; kill the job that was just submitted.
        if (taskQueue.isShutdown()) {
            LOG.warn("Task was cancelled");
            killJob();
            return 5;
        }
        this.jobID = rj.getJobID();
        updateStatusInQueryDisplay();
        returnVal = jobExecHelper.progress(rj, jc, ctx);
        success = (returnVal == 0);
    } catch (Exception e) {
        setException(e);
        String mesg = " with exception '" + Utilities.getNameMessage(e) + "'";
        // rj is only set once submission succeeded, which distinguishes the two messages.
        if (rj != null) {
            mesg = "Ended Job = " + rj.getJobID() + mesg;
        } else {
            mesg = "Job Submission failed" + mesg;
        }
        // Has to use full name to make sure it does not conflict with
        // org.apache.commons.lang3.StringUtils
        console.printError(mesg, "\n" + org.apache.hadoop.util.StringUtils.stringifyException(e));
        success = false;
        returnVal = 1;
    } finally {
        Utilities.clearWork(job);
        try {
            if (ctxCreated) {
                ctx.clear();
            }
            if (rj != null) {
                if (returnVal != 0) {
                    killJob();
                }
                jobID = rj.getID().toString();
            }
            if (jc != null) {
                jc.close();
            }
        } catch (Exception e) {
            // Cleanup is best-effort: failures here are logged and do not change the result.
            LOG.warn("Failed while cleaning up ", e);
        } finally {
            HadoopJobExecHelper.runningJobs.remove(rj);
        }
    }
    // get the list of Dynamic partition paths
    try {
        if (rj != null) {
            if (mWork.getAliasToWork() != null) {
                for (Operator<? extends OperatorDesc> op : mWork.getAliasToWork().values()) {
                    op.jobClose(job, success);
                }
            }
            if (rWork != null) {
                rWork.getReducer().jobClose(job, success);
            }
        }
    } catch (Exception e) {
        // jobClose needs to execute successfully otherwise fail task
        // (an already failed job keeps its original return value and exception)
        if (success) {
            setException(e);
            success = false;
            returnVal = 3;
            String mesg = "Job Commit failed with exception '" + Utilities.getNameMessage(e) + "'";
            console.printError(mesg, "\n" + org.apache.hadoop.util.StringUtils.stringifyException(e));
        }
    }
    return (returnVal);
}
Also used : SessionState(org.apache.hadoop.hive.ql.session.SessionState) IOPrepareCache(org.apache.hadoop.hive.ql.io.IOPrepareCache) FileStatus(org.apache.hadoop.fs.FileStatus) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) JobClient(org.apache.hadoop.mapred.JobClient) StatsPublisher(org.apache.hadoop.hive.ql.stats.StatsPublisher) StatsFactory(org.apache.hadoop.hive.ql.stats.StatsFactory) FileSystem(org.apache.hadoop.fs.FileSystem) Path(org.apache.hadoop.fs.Path) StatsCollectionContext(org.apache.hadoop.hive.ql.stats.StatsCollectionContext) Context(org.apache.hadoop.hive.ql.Context) StatsCollectionContext(org.apache.hadoop.hive.ql.stats.StatsCollectionContext) BucketizedHiveInputFormat(org.apache.hadoop.hive.ql.io.BucketizedHiveInputFormat) ReduceWork(org.apache.hadoop.hive.ql.plan.ReduceWork) IOException(java.io.IOException) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) LogInitializationException(org.apache.hadoop.hive.common.LogUtils.LogInitializationException) IOException(java.io.IOException) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) MapredLocalWork(org.apache.hadoop.hive.ql.plan.MapredLocalWork)

Example 3 with MapredLocalWork

use of org.apache.hadoop.hive.ql.plan.MapredLocalWork in project hive by apache.

the class MapredLocalTask method executeInChildVM.

/**
 * Runs the local work in a separate JVM.
 *
 * <p>Serializes the plan to {@code plan.xml} under the local temporary path,
 * assembles a {@code <hadoop> jar <hiveJar> ExecDriver ... -localtask -plan ...}
 * command line, prepares the child environment (memory, user name, Hadoop
 * options, classpath, optional secure doAs settings), launches the process and
 * waits for it to finish while forwarding its output.
 *
 * @return the exit status of the child process, or {@code 1} when any
 *         exception is raised while preparing or running it
 */
private int executeInChildVM() {
    // execute in child jvm
    try {
        // generate the cmd line to run in the child jvm
        String hiveJar = conf.getJar();
        String hadoopExec = conf.getVar(HiveConf.ConfVars.HADOOPBIN);
        conf.setVar(ConfVars.HIVEADDEDJARS, Utilities.getResourceFiles(conf, SessionState.ResourceType.JAR));
        // write out the plan to a local file
        Path planPath = new Path(context.getLocalTmpPath(), "plan.xml");
        MapredLocalWork plan = getWork();
        LOG.info("Generating plan file " + planPath.toString());
        OutputStream out = null;
        try {
            out = FileSystem.getLocal(conf).create(planPath);
            SerializationUtilities.serializePlan(plan, out);
            // Close explicitly so a failure to flush the plan is reported; the finally
            // block only handles the case where an earlier statement threw.
            out.close();
            out = null;
        } finally {
            IOUtils.closeQuietly(out);
        }
        String isSilent = "true".equalsIgnoreCase(System.getProperty("test.silent")) ? "-nolog" : "";
        String libJars = ExecDriver.getResource(conf, ResourceType.JAR);
        String libJarsOption = StringUtils.isEmpty(libJars) ? " " : " -libjars " + libJars + " ";
        String jarCmd = hiveJar + " " + ExecDriver.class.getName() + libJarsOption;
        String hiveConfArgs = ExecDriver.generateCmdLine(conf, context);
        String cmdLine = hadoopExec + " jar " + jarCmd + " -localtask -plan " + planPath.toString() + " " + isSilent + " " + hiveConfArgs;
        String workDir = (new File(".")).getCanonicalPath();
        String files = Utilities.getResourceFiles(conf, SessionState.ResourceType.FILE);
        if (!files.isEmpty()) {
            // With added files, the child runs in the local tmp dir, which gets a symlink per file.
            cmdLine = cmdLine + " -files " + files;
            workDir = context.getLocalTmpPath().toUri().getPath();
            if (!(new File(workDir)).mkdir()) {
                throw new IOException("Cannot create tmp working dir: " + workDir);
            }
            for (String f : StringUtils.split(files, ',')) {
                Path p = new Path(f);
                String target = p.toUri().getPath();
                String link = workDir + Path.SEPARATOR + p.getName();
                if (FileUtil.symLink(target, link) != 0) {
                    throw new IOException("Cannot link to added file: " + target + " from: " + link);
                }
            }
        }
        // Inherit Java system variables
        String hadoopOpts;
        StringBuilder sb = new StringBuilder();
        Properties p = System.getProperties();
        for (String element : HIVE_SYS_PROP) {
            if (p.containsKey(element)) {
                sb.append(" -D" + element + "=" + p.getProperty(element));
            }
        }
        hadoopOpts = sb.toString();
        // Inherit the environment variables
        String[] env;
        Map<String, String> variables = new HashMap<String, String>(System.getenv());
        // The user can specify the hadoop memory
        // if ("local".equals(conf.getVar(HiveConf.ConfVars.HADOOPJT))) {
        // if we are running in local mode - then the amount of memory used
        // by the child jvm can no longer default to the memory used by the
        // parent jvm
        // int hadoopMem = conf.getIntVar(HiveConf.ConfVars.HIVEHADOOPMAXMEM);
        int hadoopMem = conf.getIntVar(HiveConf.ConfVars.HIVEHADOOPMAXMEM);
        if (hadoopMem == 0) {
            // remove env var that would default child jvm to use parent's memory
            // as default. child jvm would use default memory for a hadoop client
            variables.remove(HADOOP_MEM_KEY);
        } else {
            // user specified the memory for local mode hadoop run
            console.printInfo(" set heap size\t" + hadoopMem + "MB");
            variables.put(HADOOP_MEM_KEY, String.valueOf(hadoopMem));
        }
        // } else {
        // nothing to do - we are not running in local mode - only submitting
        // the job via a child process. in this case it's appropriate that the
        // child jvm use the same memory as the parent jvm
        // }
        // Set HADOOP_USER_NAME env variable for child process, so that
        // it also runs with hadoop permissions for the user the job is running as
        // This will be used by hadoop only in unsecure(/non kerberos) mode
        String endUserName = Utils.getUGI().getShortUserName();
        LOG.debug("setting HADOOP_USER_NAME\t" + endUserName);
        variables.put("HADOOP_USER_NAME", endUserName);
        if (variables.containsKey(HADOOP_OPTS_KEY)) {
            variables.put(HADOOP_OPTS_KEY, variables.get(HADOOP_OPTS_KEY) + hadoopOpts);
        } else {
            variables.put(HADOOP_OPTS_KEY, hadoopOpts);
        }
        // Hiveserver2 using "-hiveconf hive.hadoop.classpath=%HIVE_LIB%". This is to combine path(s).
        // Use the platform path separator (";" on Windows, ":" elsewhere) so the combined
        // classpath is parsed as separate entries on every OS instead of only on Windows.
        if (HiveConf.getVar(conf, HiveConf.ConfVars.HIVE_HADOOP_CLASSPATH) != null) {
            if (variables.containsKey("HADOOP_CLASSPATH")) {
                variables.put("HADOOP_CLASSPATH", variables.get("HADOOP_CLASSPATH") + File.pathSeparator + HiveConf.getVar(conf, HiveConf.ConfVars.HIVE_HADOOP_CLASSPATH));
            } else {
                variables.put("HADOOP_CLASSPATH", HiveConf.getVar(conf, HiveConf.ConfVars.HIVE_HADOOP_CLASSPATH));
            }
        }
        if (variables.containsKey(MapRedTask.HIVE_DEBUG_RECURSIVE)) {
            MapRedTask.configureDebugVariablesForChildJVM(variables);
        }
        if (UserGroupInformation.isSecurityEnabled() && UserGroupInformation.isLoginKeytabBased()) {
            // If kerberos security is enabled, and HS2 doAs is enabled,
            // then additional params need to be set so that the command is run as
            // intended user
            secureDoAs = new SecureCmdDoAs(conf);
            secureDoAs.addEnv(variables);
        }
        // If the local-task child options variable is present in the environment, it replaces
        // HADOOP_CLIENT_OPTS for the child, and any occurrence of the previous client options
        // inside HADOOP_OPTS is substituted as well, so that the child VM can
        // have different settings from those of HiveServer2.
        if (variables.containsKey(HIVE_LOCAL_TASK_CHILD_OPTS_KEY)) {
            String childOpts = variables.get(HIVE_LOCAL_TASK_CHILD_OPTS_KEY);
            if (childOpts == null) {
                childOpts = "";
            }
            // Map.put returns the previous HADOOP_CLIENT_OPTS value (or null if it was unset).
            String clientOpts = variables.put(HADOOP_CLIENT_OPTS, childOpts);
            String tmp = variables.get(HADOOP_OPTS_KEY);
            if (tmp != null && !StringUtils.isBlank(clientOpts)) {
                tmp = tmp.replace(clientOpts, childOpts);
                variables.put(HADOOP_OPTS_KEY, tmp);
            }
        }
        // Flatten the environment map into the NAME=value array expected by Runtime.exec.
        env = new String[variables.size()];
        int pos = 0;
        for (Map.Entry<String, String> entry : variables.entrySet()) {
            String name = entry.getKey();
            String value = entry.getValue();
            env[pos++] = name + "=" + value;
            LOG.debug("Setting env: " + name + "=" + LogUtils.maskIfPassword(name, value));
        }
        LOG.info("Executing: " + cmdLine);
        // Run ExecDriver in another JVM
        executor = Runtime.getRuntime().exec(cmdLine, env, new File(workDir));
        final LogRedirector.LogSourceCallback callback = () -> {
            return executor.isAlive();
        };
        // NOTE(review): the LogRedirectors and the StreamPrinters below are both given the
        // child's stdout/stderr streams - confirm that two readers per stream is intended.
        LogRedirector.redirect(Thread.currentThread().getName() + "-LocalTask-" + getName() + "-stdout", new LogRedirector(executor.getInputStream(), LOG, callback));
        LogRedirector.redirect(Thread.currentThread().getName() + "-LocalTask-" + getName() + "-stderr", new LogRedirector(executor.getErrorStream(), LOG, callback));
        // stderr is cached so it can be attached to the session's error list on failure.
        CachingPrintStream errPrintStream = new CachingPrintStream(System.err);
        StreamPrinter outPrinter = new StreamPrinter(executor.getInputStream(), null, System.out);
        StreamPrinter errPrinter = new StreamPrinter(executor.getErrorStream(), null, errPrintStream);
        outPrinter.start();
        errPrinter.start();
        int exitVal = jobExecHelper.progressLocal(executor, getId());
        // wait for stream threads to finish
        outPrinter.join();
        errPrinter.join();
        if (exitVal != 0) {
            LOG.error("Execution failed with exit status: " + exitVal);
            if (SessionState.get() != null) {
                SessionState.get().addLocalMapRedErrors(getId(), errPrintStream.getOutput());
            }
        } else {
            LOG.info("Execution completed successfully");
        }
        return exitVal;
    } catch (Exception e) {
        LOG.error("Exception: ", e);
        return (1);
    } finally {
        // secureDoAs is only set when the Kerberos/keytab branch above ran.
        if (secureDoAs != null) {
            secureDoAs.close();
        }
    }
}
Also used : Path(org.apache.hadoop.fs.Path) LogRedirector(org.apache.hadoop.hive.common.log.LogRedirector) HashMap(java.util.HashMap) SecureCmdDoAs(org.apache.hadoop.hive.ql.exec.SecureCmdDoAs) OutputStream(java.io.OutputStream) IOException(java.io.IOException) Properties(java.util.Properties) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) IOException(java.io.IOException) CachingPrintStream(org.apache.hadoop.hive.common.io.CachingPrintStream) StreamPrinter(org.apache.hive.common.util.StreamPrinter) MapredLocalWork(org.apache.hadoop.hive.ql.plan.MapredLocalWork) File(java.io.File) Map(java.util.Map) HashMap(java.util.HashMap)

Example 4 with MapredLocalWork

use of org.apache.hadoop.hive.ql.plan.MapredLocalWork in project hive by apache.

the class SparkHashTableSinkOperator method flushToFile.

/**
 * Dumps the given side-table container into a newly created, uniquely named file and
 * then clears the container.
 *
 * <p>The target directory is produced by {@code Utilities.generatePath} from the local
 * work's temporary path, the configured dump-file prefix, the tag and the current bucket
 * file name. If persisting fails, the partially written file is deleted (best effort)
 * and the original exception is rethrown.
 *
 * @param tableContainer the side-table container to persist; cleared on success
 * @param tag the tag of the side table, also used to select the serde
 * @throws Exception if the file cannot be created or the container cannot be persisted
 */
protected void flushToFile(MapJoinPersistableTableContainer tableContainer, byte tag) throws Exception {
    MapredLocalWork localWork = getExecContext().getLocalWork();
    BucketMapJoinContext mapJoinCtx = localWork.getBucketMapjoinContext();
    Path inputPath = getExecContext().getCurrentInputPath();
    String bigInputPath = null;
    if (inputPath != null && mapJoinCtx != null) {
        Set<String> aliases = ((SparkBucketMapJoinContext) mapJoinCtx).getPosToAliasMap().get((int) tag);
        bigInputPath = mapJoinCtx.getMappingBigFile(aliases.iterator().next(), inputPath.toString());
    }
    // get tmp file URI
    Path tmpURI = localWork.getTmpHDFSPath();
    LOG.info("Temp URI for side table: " + tmpURI);
    // get current bucket file name
    String fileName = localWork.getBucketFileName(bigInputPath);
    // get the tmp URI path; it will be a hdfs path if not local mode
    String dumpFilePrefix = conf.getDumpFilePrefix();
    Path dumpDir = Utilities.generatePath(tmpURI, dumpFilePrefix, tag, fileName);
    FileSystem fs = dumpDir.getFileSystem(htsOperator.getConfiguration());
    // Create the folder and its parents if not there
    fs.mkdirs(dumpDir);
    Path path;
    while (true) {
        // Always resolve the candidate against the dump directory. Resolving against the
        // previous candidate would nest the retry under an already existing file.
        // The bounded nextInt is never negative (Math.abs(Integer.MIN_VALUE) is).
        path = new Path(dumpDir, getOperatorId() + "-" + ThreadLocalRandom.current().nextInt(Integer.MAX_VALUE));
        try {
            // This will guarantee file name uniqueness.
            if (fs.createNewFile(path)) {
                break;
            }
        } catch (FileExistsException ignored) {
            // No problem, use a new name
        }
    }
    htsOperator.console.printInfo(Utilities.now() + "\tDump the side-table for tag: " + tag + " with group count: " + tableContainer.size() + " into file: " + path);
    try {
        // get the hashtable file and path
        OutputStream os = null;
        ObjectOutputStream out = null;
        MapJoinTableContainerSerDe mapJoinTableSerde = htsOperator.mapJoinTableSerdes[tag];
        try {
            os = fs.create(path, numReplication);
            out = new ObjectOutputStream(new BufferedOutputStream(os, 4096));
            mapJoinTableSerde.persist(out, tableContainer);
        } finally {
            // Closing the object stream also closes the wrapped stream; the raw stream is
            // closed directly only when the wrapper was never constructed.
            if (out != null) {
                out.close();
            } else if (os != null) {
                os.close();
            }
        }
        FileStatus status = fs.getFileStatus(path);
        htsOperator.console.printInfo(Utilities.now() + "\tUploaded 1 File to: " + path + " (" + status.getLen() + " bytes)");
    } catch (Exception e) {
        // Failed to dump the side-table, remove the partial file
        try {
            fs.delete(path, false);
        } catch (Exception ex) {
            LOG.warn("Got exception in deleting partial side-table dump for tag: " + tag + ", file " + path, ex);
        }
        throw e;
    }
    tableContainer.clear();
}
Also used : Path(org.apache.hadoop.fs.Path) FileStatus(org.apache.hadoop.fs.FileStatus) BufferedOutputStream(java.io.BufferedOutputStream) ObjectOutputStream(java.io.ObjectOutputStream) OutputStream(java.io.OutputStream) ObjectOutputStream(java.io.ObjectOutputStream) FileExistsException(org.apache.commons.io.FileExistsException) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) MapJoinTableContainerSerDe(org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainerSerDe) SparkBucketMapJoinContext(org.apache.hadoop.hive.ql.plan.SparkBucketMapJoinContext) BucketMapJoinContext(org.apache.hadoop.hive.ql.plan.BucketMapJoinContext) FileSystem(org.apache.hadoop.fs.FileSystem) MapredLocalWork(org.apache.hadoop.hive.ql.plan.MapredLocalWork) BufferedOutputStream(java.io.BufferedOutputStream) FileExistsException(org.apache.commons.io.FileExistsException)

Example 5 with MapredLocalWork

use of org.apache.hadoop.hive.ql.plan.MapredLocalWork in project hive by apache.

the class GenMapRedUtils method setMapWork.

/**
 * Initializes the given MapWork for one table-scan alias.
 *
 * <p>Collects the input paths and partition descriptors of the (pruned) partitions,
 * applying sample pruning, list-bucketing pruning or the global limit optimization where
 * applicable, and registers them either directly on the map work or on its local work.
 *
 * @param plan
 *          map work to initialize
 * @param parseCtx
 *          parse context supplying the pruners, the global limit context and the
 *          configuration read by this method
 * @param inputs
 *          read entities for the map work
 * @param partsList
 *          pruned partition list. If it is null it will be computed on-the-fly.
 * @param tsOp
 *          table scan operator registered as the work for the alias
 * @param alias_id
 *          current alias
 * @param conf
 *          current instance of hive conf (not read by this method; configuration values
 *          are taken from {@code parseCtx.getConf()})
 * @param local
 *          whether you need to add to map-reduce or local work
 * @throws SemanticException
 *           if partition pruning fails or a partition description cannot be built
 */
public static void setMapWork(MapWork plan, ParseContext parseCtx, Set<ReadEntity> inputs, PrunedPartitionList partsList, TableScanOperator tsOp, String alias_id, HiveConf conf, boolean local) throws SemanticException {
    ArrayList<Path> partDir = new ArrayList<Path>();
    ArrayList<PartitionDesc> partDesc = new ArrayList<PartitionDesc>();
    boolean isFullAcidTable = false;
    Path tblDir = null;
    plan.setNameToSplitSample(parseCtx.getNameToSplitSample());
    // we also collect table stats while collecting column stats.
    if (parseCtx.getAnalyzeRewrite() != null) {
        plan.setGatheringStats(true);
    }
    if (partsList == null) {
        // No pruned list was supplied: compute it now. A SemanticException raised by the
        // pruner propagates to the caller unchanged.
        partsList = PartitionPruner.prune(tsOp, parseCtx, alias_id);
        isFullAcidTable = tsOp.getConf().isFullAcidTable();
    }
    // Generate the map work for this alias_id
    // pass both confirmed and unknown partitions through the map-reduce
    // framework
    Set<Partition> parts = partsList.getPartitions();
    TableDesc tableSpec = Utilities.getTableDesc(tsOp.getConf().getTableMetadata());
    PartitionDesc aliasPartnDesc = null;
    try {
        if (!parts.isEmpty()) {
            aliasPartnDesc = Utilities.getPartitionDesc(parts.iterator().next(), tableSpec);
        }
    } catch (HiveException e) {
        LOG.error("Failed getPartitionDesc", e);
        throw new SemanticException(e.getMessage(), e);
    }
    // The table does not have any partitions
    if (aliasPartnDesc == null) {
        aliasPartnDesc = new PartitionDesc(tableSpec, null);
    }
    Map<String, String> props = tsOp.getConf().getOpProps();
    if (props != null) {
        Properties target = aliasPartnDesc.getProperties();
        target.putAll(props);
    }
    plan.getAliasToPartnInfo().put(alias_id, aliasPartnDesc);
    long sizeNeeded = Integer.MAX_VALUE;
    int fileLimit = -1;
    if (parseCtx.getGlobalLimitCtx().isEnable()) {
        if (isFullAcidTable) {
            LOG.info("Skipping Global Limit optimization for an ACID table");
            parseCtx.getGlobalLimitCtx().disableOpt();
        } else {
            long sizePerRow = HiveConf.getLongVar(parseCtx.getConf(), HiveConf.ConfVars.HIVELIMITMAXROWSIZE);
            sizeNeeded = (parseCtx.getGlobalLimitCtx().getGlobalOffset() + parseCtx.getGlobalLimitCtx().getGlobalLimit()) * sizePerRow;
            // for the optimization that reduce number of input file, we limit number
            // of files allowed. If more than specific number of files have to be
            // selected, we skip this optimization. Since having too many files as
            // inputs can cause unpredictable latency. It's not necessarily to be
            // cheaper.
            fileLimit = HiveConf.getIntVar(parseCtx.getConf(), HiveConf.ConfVars.HIVELIMITOPTLIMITFILE);
            if (sizePerRow <= 0 || fileLimit <= 0) {
                LOG.info("Skip optimization to reduce input size of 'limit'");
                parseCtx.getGlobalLimitCtx().disableOpt();
            } else if (parts.isEmpty()) {
                LOG.info("Empty input: skip limit optimization");
            } else {
                LOG.info("Try to reduce input size for 'limit' " + "sizeNeeded: " + sizeNeeded + "  file limit : " + fileLimit);
            }
        }
    }
    boolean isFirstPart = true;
    boolean emptyInput = true;
    boolean singlePartition = (parts.size() == 1);
    // Track the dependencies for the view. Consider a query like: select * from V;
    // where V is a view of the form: select * from T
    // The dependencies should include V at depth 0, and T at depth 1 (inferred).
    Map<String, ReadEntity> viewToInput = parseCtx.getViewAliasToInput();
    ReadEntity parentViewInfo = PlanUtils.getParentViewInfo(alias_id, viewToInput);
    // The table should also be considered a part of inputs, even if the table is a
    // partitioned table and whether any partition is selected or not
    // This read entity is a direct read entity and not an indirect read (that is when
    // this is being read because it is a dependency of a view).
    boolean isDirectRead = (parentViewInfo == null);
    TableDesc tblDesc = null;
    boolean initTableDesc = false;
    PlanUtils.addPartitionInputs(parts, inputs, parentViewInfo, isDirectRead);
    for (Partition part : parts) {
        // Later the properties have to come from the partition as opposed
        // to from the table in order to support versioning.
        Path[] paths = null;
        SampleDesc sampleDescr = parseCtx.getOpToSamplePruner().get(tsOp);
        // Lookup list bucketing pruner
        Map<String, ExprNodeDesc> partToPruner = parseCtx.getOpToPartToSkewedPruner().get(tsOp);
        ExprNodeDesc listBucketingPruner = (partToPruner != null) ? partToPruner.get(part.getName()) : null;
        if (sampleDescr != null) {
            assert (listBucketingPruner == null) : "Sampling and list bucketing can't coexit.";
            paths = SamplePruner.prune(part, sampleDescr);
            parseCtx.getGlobalLimitCtx().disableOpt();
        } else if (listBucketingPruner != null) {
            assert (sampleDescr == null) : "Sampling and list bucketing can't coexist.";
            /* Use list bucketing prunner's path. */
            paths = ListBucketingPruner.prune(parseCtx, part, listBucketingPruner);
        } else {
            // Only the first partition is tried for limit pruning; if it does not
            // contain enough size, we change to normal mode.
            if (parseCtx.getGlobalLimitCtx().isEnable()) {
                if (isFirstPart) {
                    long sizeLeft = sizeNeeded;
                    ArrayList<Path> retPathList = new ArrayList<Path>();
                    SamplePruner.LimitPruneRetStatus status = SamplePruner.limitPrune(part, sizeLeft, fileLimit, retPathList);
                    if (status.equals(SamplePruner.LimitPruneRetStatus.NoFile)) {
                        continue;
                    } else if (status.equals(SamplePruner.LimitPruneRetStatus.NotQualify)) {
                        LOG.info("Use full input -- first " + fileLimit + " files are more than " + sizeNeeded + " bytes");
                        parseCtx.getGlobalLimitCtx().disableOpt();
                    } else {
                        emptyInput = false;
                        paths = retPathList.toArray(new Path[0]);
                        if (status.equals(SamplePruner.LimitPruneRetStatus.NeedAllFiles) && singlePartition) {
                            // if all files are needed to meet the size limit, we disable
                            // optimization. It usually happens for empty table/partition or
                            // table/partition with only one file. By disabling this
                            // optimization, we can avoid retrying the query if there is
                            // not sufficient rows.
                            parseCtx.getGlobalLimitCtx().disableOpt();
                        }
                    }
                    isFirstPart = false;
                } else {
                    paths = new Path[0];
                }
            }
            // Re-checked on purpose: the optimization may have been disabled just above,
            // in which case the full partition input is used.
            if (!parseCtx.getGlobalLimitCtx().isEnable()) {
                paths = part.getPath();
            }
        }
        // is it a partitioned table ?
        if (!part.getTable().isPartitioned()) {
            assert (tblDir == null);
            tblDir = paths[0];
        }
        // The table descriptor is built once, from the first partition that reaches here.
        if (!initTableDesc) {
            tblDesc = Utilities.getTableDesc(part.getTable());
            initTableDesc = true;
        }
        if (props != null) {
            Properties target = tblDesc.getProperties();
            target.putAll(props);
        }
        for (Path p : paths) {
            if (p == null) {
                continue;
            }
            LOG.debug("Adding {} of table {}", p, alias_id);
            partDir.add(p);
            try {
                if (part.getTable().isPartitioned()) {
                    partDesc.add(Utilities.getPartitionDesc(part, tblDesc));
                } else {
                    partDesc.add(Utilities.getPartitionDescFromTableDesc(tblDesc, part, false));
                }
            } catch (HiveException e) {
                LOG.error("Failed to add partition description", e);
                throw new SemanticException(e.getMessage(), e);
            }
        }
    }
    if (emptyInput) {
        parseCtx.getGlobalLimitCtx().disableOpt();
    }
    Utilities.addSchemaEvolutionToTableScanOperator(partsList.getSourceTable(), tsOp);
    Iterator<Path> iterPath = partDir.iterator();
    Iterator<PartitionDesc> iterPartnDesc = partDesc.iterator();
    if (!local) {
        while (iterPath.hasNext()) {
            assert iterPartnDesc.hasNext();
            Path path = iterPath.next();
            PartitionDesc prtDesc = iterPartnDesc.next();
            // Add the path to alias mapping
            plan.addPathToAlias(path, alias_id);
            plan.addPathToPartitionInfo(path, prtDesc);
            LOG.debug("Information added for path {}", path);
        }
        assert plan.getAliasToWork().get(alias_id) == null;
        plan.getAliasToWork().put(alias_id, tsOp);
    } else {
        // populate local work if needed
        MapredLocalWork localPlan = plan.getMapRedLocalWork();
        if (localPlan == null) {
            localPlan = new MapredLocalWork(new LinkedHashMap<String, Operator<? extends OperatorDesc>>(), new LinkedHashMap<String, FetchWork>());
        }
        assert localPlan.getAliasToWork().get(alias_id) == null;
        assert localPlan.getAliasToFetchWork().get(alias_id) == null;
        localPlan.getAliasToWork().put(alias_id, tsOp);
        if (tblDir == null) {
            // No unpartitioned table directory was recorded: fetch from the collected
            // partition directories instead.
            tblDesc = Utilities.getTableDesc(partsList.getSourceTable());
            localPlan.getAliasToFetchWork().put(alias_id, new FetchWork(partDir, partDesc, tblDesc));
        } else {
            localPlan.getAliasToFetchWork().put(alias_id, new FetchWork(tblDir, tblDesc));
        }
        plan.setMapRedLocalWork(localPlan);
    }
}
Also used : HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) ArrayList(java.util.ArrayList) Properties(java.util.Properties) LinkedHashMap(java.util.LinkedHashMap) ExprNodeDesc(org.apache.hadoop.hive.ql.plan.ExprNodeDesc) SemanticException(org.apache.hadoop.hive.ql.parse.SemanticException) Path(org.apache.hadoop.fs.Path) Partition(org.apache.hadoop.hive.ql.metadata.Partition) SampleDesc(org.apache.hadoop.hive.ql.plan.FilterDesc.SampleDesc) ReadEntity(org.apache.hadoop.hive.ql.hooks.ReadEntity) MapredLocalWork(org.apache.hadoop.hive.ql.plan.MapredLocalWork) FetchWork(org.apache.hadoop.hive.ql.plan.FetchWork) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc) LoadTableDesc(org.apache.hadoop.hive.ql.plan.LoadTableDesc) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc)

Aggregations

MapredLocalWork (org.apache.hadoop.hive.ql.plan.MapredLocalWork)18 Path (org.apache.hadoop.fs.Path)15 HiveException (org.apache.hadoop.hive.ql.metadata.HiveException)10 Map (java.util.Map)7 Operator (org.apache.hadoop.hive.ql.exec.Operator)7 OperatorDesc (org.apache.hadoop.hive.ql.plan.OperatorDesc)7 ArrayList (java.util.ArrayList)6 HashMap (java.util.HashMap)6 LinkedHashMap (java.util.LinkedHashMap)6 MapJoinOperator (org.apache.hadoop.hive.ql.exec.MapJoinOperator)6 FetchWork (org.apache.hadoop.hive.ql.plan.FetchWork)6 PartitionDesc (org.apache.hadoop.hive.ql.plan.PartitionDesc)6 IOException (java.io.IOException)5 List (java.util.List)5 FileSystem (org.apache.hadoop.fs.FileSystem)5 MapWork (org.apache.hadoop.hive.ql.plan.MapWork)5 CompilationOpContext (org.apache.hadoop.hive.ql.CompilationOpContext)4 JoinOperator (org.apache.hadoop.hive.ql.exec.JoinOperator)4 TableDesc (org.apache.hadoop.hive.ql.plan.TableDesc)4 OutputStream (java.io.OutputStream)3