
Example 6 with MapRedTask

use of org.apache.hadoop.hive.ql.exec.mr.MapRedTask in project hive by apache.

the class Utilities method reworkMapRedWork.

/**
 * The check here is not particularly clean. It first loops over all input
 * formats and collects those that implement ReworkMapredInputFormat into a
 * set, and then goes through that set and calls rework for each one.
 *
 * Technically all of this could be avoided if all of Hive's input formats
 * shared a common interface. In today's Hive and Hadoop that is not possible,
 * because many of the input formats Hive uses live in Hadoop's code base, and
 * most of Hadoop's input formats only implement the InputFormat interface.
 *
 * @param task
 * @param reworkMapredWork
 * @param conf
 * @throws SemanticException
 */
public static void reworkMapRedWork(Task<?> task, boolean reworkMapredWork, HiveConf conf) throws SemanticException {
    if (reworkMapredWork && (task instanceof MapRedTask)) {
        try {
            MapredWork mapredWork = ((MapRedTask) task).getWork();
            Set<Class<? extends InputFormat>> reworkInputFormats = new HashSet<Class<? extends InputFormat>>();
            for (PartitionDesc part : mapredWork.getMapWork().getPathToPartitionInfo().values()) {
                Class<? extends InputFormat> inputFormatCls = part.getInputFileFormatClass();
                if (ReworkMapredInputFormat.class.isAssignableFrom(inputFormatCls)) {
                    reworkInputFormats.add(inputFormatCls);
                }
            }
            if (reworkInputFormats.size() > 0) {
                for (Class<? extends InputFormat> inputFormatCls : reworkInputFormats) {
                    ReworkMapredInputFormat inst = (ReworkMapredInputFormat) ReflectionUtil.newInstance(inputFormatCls, null);
                    inst.rework(conf, mapredWork);
                }
            }
        } catch (IOException e) {
            throw new SemanticException(e);
        }
    }
}
Also used : MapRedTask(org.apache.hadoop.hive.ql.exec.mr.MapRedTask) MapredWork(org.apache.hadoop.hive.ql.plan.MapredWork) ReworkMapredInputFormat(org.apache.hadoop.hive.ql.io.ReworkMapredInputFormat) OneNullRowInputFormat(org.apache.hadoop.hive.ql.io.OneNullRowInputFormat) HiveInputFormat(org.apache.hadoop.hive.ql.io.HiveInputFormat) SequenceFileInputFormat(org.apache.hadoop.mapred.SequenceFileInputFormat) ContentSummaryInputFormat(org.apache.hadoop.hive.ql.io.ContentSummaryInputFormat) InputFormat(org.apache.hadoop.mapred.InputFormat) FileInputFormat(org.apache.hadoop.mapred.FileInputFormat) TextInputFormat(org.apache.hadoop.mapred.TextInputFormat) ReworkMapredInputFormat(org.apache.hadoop.hive.ql.io.ReworkMapredInputFormat) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc) IOException(java.io.IOException) HashSet(java.util.HashSet) SemanticException(org.apache.hadoop.hive.ql.parse.SemanticException)
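Note: the loop above only invokes the rework hook; the hook itself is defined by the ReworkMapredInputFormat interface. The following sketch is a minimal, hypothetical implementation, assuming only the contract implied by the call site (a rework(HiveConf, MapredWork) method that may throw IOException). A real implementation would also be an InputFormat, since reworkMapRedWork only inspects input format classes.

import java.io.IOException;

import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.io.ReworkMapredInputFormat;
import org.apache.hadoop.hive.ql.plan.MapredWork;

// Hypothetical example of the rework hook used by Utilities.reworkMapRedWork.
public class ExampleReworkInputFormat implements ReworkMapredInputFormat {

    @Override
    public void rework(HiveConf job, MapredWork work) throws IOException {
        // Illustrative only: a real implementation would adjust the plan here,
        // e.g. rewrite input paths or partition descriptors in work.getMapWork().
    }
}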

Example 7 with MapRedTask

use of org.apache.hadoop.hive.ql.exec.mr.MapRedTask in project hive by apache.

the class SortMergeJoinTaskDispatcher method convertSMBTaskToMapJoinTask.

// create a map join task with the table at bigTablePosition as the big table
private MapRedTask convertSMBTaskToMapJoinTask(MapredWork origWork, int bigTablePosition, SMBMapJoinOperator smbJoinOp) throws SemanticException {
    // deep copy a new mapred work
    MapredWork newWork = SerializationUtilities.clonePlan(origWork);
    // create a mapred task for this work
    MapRedTask newTask = (MapRedTask) TaskFactory.get(newWork);
    // generate the map join operator; already checked the map join
    MapJoinOperator newMapJoinOp = getMapJoinOperator(newTask, newWork, smbJoinOp, bigTablePosition);
    // The reducer needs to be restored - Consider a query like:
    // select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key;
    // The reducer contains a groupby, which needs to be restored.
    ReduceWork rWork = newWork.getReduceWork();
    // create the local work for this plan
    MapJoinProcessor.genLocalWorkForMapJoin(newWork, newMapJoinOp, bigTablePosition);
    // restore the reducer
    newWork.setReduceWork(rWork);
    return newTask;
}
Also used : MapRedTask(org.apache.hadoop.hive.ql.exec.mr.MapRedTask) MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) SMBMapJoinOperator(org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator) MapredWork(org.apache.hadoop.hive.ql.plan.MapredWork) ReduceWork(org.apache.hadoop.hive.ql.plan.ReduceWork)

Example 8 with MapRedTask

use of org.apache.hadoop.hive.ql.exec.mr.MapRedTask in project hive by apache.

the class SortMergeJoinTaskDispatcher method processCurrentTask.

@Override
public Task<?> processCurrentTask(MapRedTask currTask, ConditionalTask conditionalTask, Context context) throws SemanticException {
    // whether it contains a sort merge join operator
    MapredWork currWork = currTask.getWork();
    SMBMapJoinOperator originalSMBJoinOp = getSMBMapJoinOp(currWork);
    if (!isEligibleForOptimization(originalSMBJoinOp)) {
        return null;
    }
    currTask.setTaskTag(Task.CONVERTED_SORTMERGEJOIN);
    // Convert the work containing the sort-merge join into a work as if it had a regular join.
    // Note that the operator tree is not changed - it still contains the SMB join - but the
    // plan is changed (aliasToWork etc.) to contain all the paths as if it were a regular join.
    // This is used to convert the plan to a map-join, and then the original SMB join plan is used
    // as a backup task.
    MapredWork currJoinWork = convertSMBWorkToJoinWork(currWork, originalSMBJoinOp);
    SMBMapJoinOperator newSMBJoinOp = getSMBMapJoinOp(currJoinWork);
    currWork.getMapWork().setLeftInputJoin(originalSMBJoinOp.getConf().isLeftInputJoin());
    currWork.getMapWork().setBaseSrc(originalSMBJoinOp.getConf().getBaseSrc());
    currWork.getMapWork().setMapAliases(originalSMBJoinOp.getConf().getMapAliases());
    currJoinWork.getMapWork().setLeftInputJoin(originalSMBJoinOp.getConf().isLeftInputJoin());
    currJoinWork.getMapWork().setBaseSrc(originalSMBJoinOp.getConf().getBaseSrc());
    currJoinWork.getMapWork().setMapAliases(originalSMBJoinOp.getConf().getMapAliases());
    // create conditional work list and task list
    List<Serializable> listWorks = new ArrayList<Serializable>();
    List<Task<?>> listTasks = new ArrayList<Task<?>>();
    // create task to aliases mapping and alias to input file mapping for resolver
    // Must be deterministic order map for consistent q-test output across Java versions
    HashMap<Task<?>, Set<String>> taskToAliases = new LinkedHashMap<Task<?>, Set<String>>();
    // Note that pathToAlias will behave as if the original plan was a join plan
    Map<Path, List<String>> pathToAliases = currJoinWork.getMapWork().getPathToAliases();
    // generate a map join task for the big table
    SMBJoinDesc originalSMBJoinDesc = originalSMBJoinOp.getConf();
    Byte[] order = originalSMBJoinDesc.getTagOrder();
    int numAliases = order.length;
    Set<Integer> bigTableCandidates = MapJoinProcessor.getBigTableCandidates(originalSMBJoinDesc.getConds());
    HashMap<String, Long> aliasToSize = new HashMap<String, Long>();
    Configuration conf = context.getConf();
    try {
        long aliasTotalKnownInputSize = getTotalKnownInputSize(context, currJoinWork.getMapWork(), pathToAliases, aliasToSize);
        long ThresholdOfSmallTblSizeSum = HiveConf.getLongVar(conf, HiveConf.ConfVars.HIVESMALLTABLESFILESIZE);
        for (int bigTablePosition = 0; bigTablePosition < numAliases; bigTablePosition++) {
            // this table cannot be the big table
            if (!bigTableCandidates.contains(bigTablePosition)) {
                continue;
            }
            // create map join task for the given big table position
            MapRedTask newTask = convertSMBTaskToMapJoinTask(currJoinWork, bigTablePosition, newSMBJoinOp);
            MapWork mapWork = newTask.getWork().getMapWork();
            Operator<?> parentOp = originalSMBJoinOp.getParentOperators().get(bigTablePosition);
            Set<String> aliases = GenMapRedUtils.findAliases(mapWork, parentOp);
            long aliasKnownSize = Utilities.sumOf(aliasToSize, aliases);
            if (aliasKnownSize > 0) {
                long smallTblTotalKnownSize = aliasTotalKnownInputSize - aliasKnownSize;
                if (smallTblTotalKnownSize > ThresholdOfSmallTblSizeSum) {
                    // this table is not a good candidate for the big table.
                    continue;
                }
            }
            // add into conditional task
            listWorks.add(newTask.getWork());
            listTasks.add(newTask);
            newTask.setTaskTag(Task.CONVERTED_MAPJOIN);
            newTask.setFetchSource(currTask.isFetchSource());
            // set up backup task
            newTask.setBackupTask(currTask);
            newTask.setBackupChildrenTasks(currTask.getChildTasks());
            // put the mapping task to aliases
            taskToAliases.put(newTask, aliases);
        }
    } catch (Exception e) {
        throw new SemanticException("Generate Map Join Task Error", e);
    }
    // insert current common join task to conditional task
    listWorks.add(currTask.getWork());
    listTasks.add(currTask);
    // clear JoinTree and OP Parse Context
    currWork.getMapWork().setLeftInputJoin(false);
    currWork.getMapWork().setBaseSrc(null);
    currWork.getMapWork().setMapAliases(null);
    // create conditional task and insert conditional task into task tree
    ConditionalWork cndWork = new ConditionalWork(listWorks);
    ConditionalTask cndTsk = (ConditionalTask) TaskFactory.get(cndWork);
    cndTsk.setListTasks(listTasks);
    // set resolver and resolver context
    cndTsk.setResolver(new ConditionalResolverCommonJoin());
    ConditionalResolverCommonJoinCtx resolverCtx = new ConditionalResolverCommonJoinCtx();
    resolverCtx.setPathToAliases(pathToAliases);
    resolverCtx.setAliasToKnownSize(aliasToSize);
    resolverCtx.setTaskToAliases(taskToAliases);
    resolverCtx.setCommonJoinTask(currTask);
    resolverCtx.setLocalTmpDir(context.getLocalScratchDir(false));
    resolverCtx.setHdfsTmpDir(context.getMRScratchDir());
    cndTsk.setResolverCtx(resolverCtx);
    // replace the current task with the new generated conditional task
    replaceTaskWithConditionalTask(currTask, cndTsk);
    return cndTsk;
}
Also used : Serializable(java.io.Serializable) ConditionalTask(org.apache.hadoop.hive.ql.exec.ConditionalTask) Task(org.apache.hadoop.hive.ql.exec.Task) MapRedTask(org.apache.hadoop.hive.ql.exec.mr.MapRedTask) Set(java.util.Set) SMBJoinDesc(org.apache.hadoop.hive.ql.plan.SMBJoinDesc) Configuration(org.apache.hadoop.conf.Configuration) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) ArrayList(java.util.ArrayList) ConditionalWork(org.apache.hadoop.hive.ql.plan.ConditionalWork) SMBMapJoinOperator(org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator) LinkedHashMap(java.util.LinkedHashMap) MapRedTask(org.apache.hadoop.hive.ql.exec.mr.MapRedTask) ConditionalResolverCommonJoinCtx(org.apache.hadoop.hive.ql.plan.ConditionalResolverCommonJoin.ConditionalResolverCommonJoinCtx) MapredWork(org.apache.hadoop.hive.ql.plan.MapredWork) ConditionalTask(org.apache.hadoop.hive.ql.exec.ConditionalTask) ArrayList(java.util.ArrayList) List(java.util.List) ConditionalResolverCommonJoin(org.apache.hadoop.hive.ql.plan.ConditionalResolverCommonJoin) SemanticException(org.apache.hadoop.hive.ql.parse.SemanticException) Path(org.apache.hadoop.fs.Path) SemanticException(org.apache.hadoop.hive.ql.parse.SemanticException) MapWork(org.apache.hadoop.hive.ql.plan.MapWork)
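The size-based filter in the loop above is the core of the big-table selection. Below is a small, self-contained sketch of the same rule with hypothetical names; the threshold corresponds to the small-table size limit read via HiveConf.ConfVars.HIVESMALLTABLESFILESIZE in the code above, and the helper itself is illustrative, not part of Hive.

import java.util.Map;
import java.util.Set;

// Illustrative helper: a candidate big table is acceptable only if the combined
// known size of all the *other* (small) tables stays under the configured threshold.
public final class BigTableCandidateCheck {

    private BigTableCandidateCheck() {
    }

    public static boolean canBeBigTable(long totalKnownInputSize,
                                        Map<String, Long> aliasToSize,
                                        Set<String> bigTableAliases,
                                        long smallTableSizeThreshold) {
        long bigTableKnownSize = 0;
        for (String alias : bigTableAliases) {
            Long size = aliasToSize.get(alias);
            if (size != null) {
                bigTableKnownSize += size;
            }
        }
        if (bigTableKnownSize <= 0) {
            // Size unknown: keep the candidate and let the conditional
            // resolver decide at run time.
            return true;
        }
        long smallTablesKnownSize = totalKnownInputSize - bigTableKnownSize;
        return smallTablesKnownSize <= smallTableSizeThreshold;
    }
}

With a check like this, a candidate whose sibling tables are known to exceed the threshold is skipped, while a candidate whose size is unknown is kept so ConditionalResolverCommonJoin can pick the task at run time.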

Example 9 with MapRedTask

use of org.apache.hadoop.hive.ql.exec.mr.MapRedTask in project hive by apache.

the class TaskQueue method finished.

public void finished(TaskRunner runner) {
    if (statsTasks.isEmpty() || !(runner.getTask() instanceof MapRedTask)) {
        return;
    }
    MapRedTask mapredTask = (MapRedTask) runner.getTask();
    MapWork mapWork = mapredTask.getWork().getMapWork();
    ReduceWork reduceWork = mapredTask.getWork().getReduceWork();
    List<Operator> operators = new ArrayList<Operator>(mapWork.getAliasToWork().values());
    if (reduceWork != null) {
        operators.add(reduceWork.getReducer());
    }
    final List<String> statKeys = new ArrayList<String>(1);
    NodeUtils.iterate(operators, FileSinkOperator.class, new Function<FileSinkOperator>() {

        @Override
        public void apply(FileSinkOperator fsOp) {
            if (fsOp.getConf().isGatherStats()) {
                statKeys.add(fsOp.getConf().getStatsAggPrefix());
            }
        }
    });
    for (String statKey : statKeys) {
        if (statsTasks.containsKey(statKey)) {
            statsTasks.get(statKey).getWork().setSourceTask(mapredTask);
        } else {
            LOG.debug("There is no correspoing statTask for: " + statKey);
        }
    }
}
Also used : MapRedTask(org.apache.hadoop.hive.ql.exec.mr.MapRedTask) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) ArrayList(java.util.ArrayList) ReduceWork(org.apache.hadoop.hive.ql.plan.ReduceWork)
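NodeUtils.iterate here walks the operator tree and applies the callback to every FileSinkOperator it finds. The sketch below shows the traversal pattern with a minimal, hypothetical Node interface; it is an illustration of the idea, not the actual NodeUtils implementation.

import java.util.List;
import java.util.function.Consumer;

// A minimal tree walk that applies an action to every node of a given type,
// mirroring how TaskQueue.finished() collects FileSinkOperators above.
public final class TreeWalk {

    public interface Node {
        List<? extends Node> getChildren();
    }

    private TreeWalk() {
    }

    public static <T extends Node> void iterate(List<? extends Node> roots,
                                                Class<T> type,
                                                Consumer<T> action) {
        for (Node node : roots) {
            visit(node, type, action);
        }
    }

    private static <T extends Node> void visit(Node node, Class<T> type, Consumer<T> action) {
        if (node == null) {
            return;
        }
        if (type.isInstance(node)) {
            action.accept(type.cast(node));
        }
        List<? extends Node> children = node.getChildren();
        if (children != null) {
            for (Node child : children) {
                visit(child, type, action);
            }
        }
    }
}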

Example 10 with MapRedTask

use of org.apache.hadoop.hive.ql.exec.mr.MapRedTask in project hive by apache.

the class TestSymlinkTextInputFormat method testCombine.

/**
 * Test combining splits for a symlink text input file. There are two input
 * dirs, each containing one file, and one symlink file is created that points
 * to both files. Without combining, this would normally produce at least 2 splits.
 */
@Test
public void testCombine() throws Exception {
    JobConf newJob = new JobConf(job);
    FileSystem fs = dataDir1.getFileSystem(newJob);
    Path dir1_file1 = new Path(dataDir1, "combinefile1_1");
    writeTextFile(dir1_file1, "dir1_file1_line1\n" + "dir1_file1_line2\n");
    Path dir2_file1 = new Path(dataDir2, "combinefile2_1");
    writeTextFile(dir2_file1, "dir2_file1_line1\n" + "dir2_file1_line2\n");
    // A symlink file, contains first file from first dir and second file from
    // second dir.
    writeSymlinkFile(new Path(symlinkDir, "symlink_file"), new Path(dataDir1, "combinefile1_1"), new Path(dataDir2, "combinefile2_1"));
    HiveConf hiveConf = new HiveConf(TestSymlinkTextInputFormat.class);
    hiveConf.setVar(HiveConf.ConfVars.HIVE_AUTHORIZATION_MANAGER, "org.apache.hadoop.hive.ql.security.authorization.plugin.sqlstd.SQLStdHiveAuthorizerFactory");
    HiveConf.setBoolVar(hiveConf, HiveConf.ConfVars.HIVE_REWORK_MAPREDWORK, true);
    HiveConf.setBoolVar(hiveConf, HiveConf.ConfVars.HIVE_SUPPORT_CONCURRENCY, false);
    Driver drv = new Driver(hiveConf);
    String tblName = "text_symlink_text";
    String createSymlinkTableCmd = "create table " + tblName + " (key int) stored as " + " inputformat 'org.apache.hadoop.hive.ql.io.SymlinkTextInputFormat' " + " outputformat 'org.apache.hadoop.hive.ql.io.IgnoreKeyTextOutputFormat'";
    SessionState.start(hiveConf);
    boolean tblCreated = false;
    try {
        int ecode = 0;
        drv.run(createSymlinkTableCmd);
        tblCreated = true;
        String loadFileCommand = "LOAD DATA LOCAL INPATH '" + new Path(symlinkDir, "symlink_file").toString() + "' INTO TABLE " + tblName;
        drv.run(loadFileCommand);
        String cmd = "select key*1 from " + tblName;
        ecode = drv.compile(cmd, true);
        if (ecode != 0) {
            throw new Exception("Select compile: " + cmd + " failed with exit code= " + ecode);
        }
        // create scratch dir
        Context ctx = new Context(newJob);
        Path emptyScratchDir = ctx.getMRTmpPath();
        FileSystem fileSys = emptyScratchDir.getFileSystem(newJob);
        fileSys.mkdirs(emptyScratchDir);
        QueryPlan plan = drv.getPlan();
        MapRedTask selectTask = (MapRedTask) plan.getRootTasks().get(0);
        List<Path> inputPaths = Utilities.getInputPaths(newJob, selectTask.getWork().getMapWork(), emptyScratchDir, ctx, false);
        Utilities.setInputPaths(newJob, inputPaths);
        Utilities.setMapRedWork(newJob, selectTask.getWork(), ctx.getMRTmpPath());
        CombineHiveInputFormat combineInputFormat = ReflectionUtils.newInstance(CombineHiveInputFormat.class, newJob);
        InputSplit[] retSplits = combineInputFormat.getSplits(newJob, 1);
        assertEquals(1, retSplits.length);
    } catch (Exception e) {
        e.printStackTrace();
        fail("Caught exception " + e);
    } finally {
        if (tblCreated) {
            drv.run("drop table text_symlink_text");
        }
    }
}
Also used : Path(org.apache.hadoop.fs.Path) Context(org.apache.hadoop.hive.ql.Context) Driver(org.apache.hadoop.hive.ql.Driver) QueryPlan(org.apache.hadoop.hive.ql.QueryPlan) IOException(java.io.IOException) MapRedTask(org.apache.hadoop.hive.ql.exec.mr.MapRedTask) FileSystem(org.apache.hadoop.fs.FileSystem) HiveConf(org.apache.hadoop.hive.conf.HiveConf) JobConf(org.apache.hadoop.mapred.JobConf) InputSplit(org.apache.hadoop.mapred.InputSplit) Test(org.junit.Test)
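The writeSymlinkFile helper is not shown in this excerpt. For SymlinkTextInputFormat, a symlink file is a plain text file whose lines are the paths of the real data files, so a rough, hypothetical equivalent of the helper could look like this:

import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.nio.charset.StandardCharsets;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;

// Hypothetical helper: write a symlink file whose lines are the paths of the
// target data files. SymlinkTextInputFormat later reads these lines and uses
// the listed files as the actual input.
public final class SymlinkFiles {

    private SymlinkFiles() {
    }

    public static void writeSymlinkFile(JobConf conf, Path symlink, Path... targets) throws IOException {
        FileSystem fs = symlink.getFileSystem(conf);
        try (Writer out = new OutputStreamWriter(fs.create(symlink), StandardCharsets.UTF_8)) {
            for (Path target : targets) {
                out.write(target.toString());
                out.write("\n");
            }
        }
    }
}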

Aggregations

MapRedTask (org.apache.hadoop.hive.ql.exec.mr.MapRedTask) 21
Task (org.apache.hadoop.hive.ql.exec.Task) 9
MapredWork (org.apache.hadoop.hive.ql.plan.MapredWork) 9
ArrayList (java.util.ArrayList) 8
ConditionalTask (org.apache.hadoop.hive.ql.exec.ConditionalTask) 8
SemanticException (org.apache.hadoop.hive.ql.parse.SemanticException) 8
MapWork (org.apache.hadoop.hive.ql.plan.MapWork) 8
TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator) 7
List (java.util.List) 6
Operator (org.apache.hadoop.hive.ql.exec.Operator) 5
Serializable (java.io.Serializable) 4
Path (org.apache.hadoop.fs.Path) 4
FileSinkOperator (org.apache.hadoop.hive.ql.exec.FileSinkOperator) 4
IOException (java.io.IOException) 3
HashSet (java.util.HashSet) 3
JoinOperator (org.apache.hadoop.hive.ql.exec.JoinOperator) 3
TezTask (org.apache.hadoop.hive.ql.exec.tez.TezTask) 3
OperatorDesc (org.apache.hadoop.hive.ql.plan.OperatorDesc) 3
ReduceWork (org.apache.hadoop.hive.ql.plan.ReduceWork) 3
HashMap (java.util.HashMap) 2