Example 1 with MergeFileWork

Use of org.apache.hadoop.hive.ql.io.merge.MergeFileWork in project hive by apache.

From the class SparkPlanGenerator, method generate.

private SparkTran generate(BaseWork work, SparkWork sparkWork) throws Exception {
    initStatsPublisher(work);
    JobConf newJobConf = cloneJobConf(work);
    checkSpecs(work, newJobConf);
    byte[] confBytes = KryoSerializer.serializeJobConf(newJobConf);
    boolean caching = isCachingWork(work, sparkWork);
    if (work instanceof MapWork) {
        // Create tmp dir for MergeFileWork
        if (work instanceof MergeFileWork) {
            Path outputPath = ((MergeFileWork) work).getOutputDir();
            Path tempOutPath = Utilities.toTempPath(outputPath);
            FileSystem fs = outputPath.getFileSystem(jobConf);
            try {
                if (!fs.exists(tempOutPath)) {
                    fs.mkdirs(tempOutPath);
                }
            } catch (IOException e) {
                throw new RuntimeException("Can't make path " + outputPath + " : " + e.getMessage());
            }
        }
        MapTran mapTran = new MapTran(caching);
        HiveMapFunction mapFunc = new HiveMapFunction(confBytes, sparkReporter);
        mapTran.setMapFunction(mapFunc);
        return mapTran;
    } else if (work instanceof ReduceWork) {
        ReduceTran reduceTran = new ReduceTran(caching);
        HiveReduceFunction reduceFunc = new HiveReduceFunction(confBytes, sparkReporter);
        reduceTran.setReduceFunction(reduceFunc);
        return reduceTran;
    } else {
        throw new IllegalStateException("AssertionError: expected either MapWork or ReduceWork, " + "but found " + work.getClass().getName());
    }
}
Also used : Path(org.apache.hadoop.fs.Path) MergeFileWork(org.apache.hadoop.hive.ql.io.merge.MergeFileWork) IOException(java.io.IOException) ReduceWork(org.apache.hadoop.hive.ql.plan.ReduceWork) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) FileSystem(org.apache.hadoop.fs.FileSystem) JobConf(org.apache.hadoop.mapred.JobConf)
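
The MergeFileWork branch above only has to guarantee that a temporary output directory exists before the Spark map tasks start writing merged files. Below is a minimal, self-contained sketch of that idiom; the class and helper names are illustrative, and the "_tmp." prefix is an assumption used to stand in for Hive's Utilities.toTempPath.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class TempOutputDirs {

    // Hypothetical helper: create the temporary output directory for a merge job
    // if it does not exist yet, and return it. Hive derives the temp path with
    // Utilities.toTempPath(outputPath); the "_tmp." prefix below is an assumption
    // kept only to make the sketch self-contained.
    static Path ensureTempOutputDir(Path outputPath, Configuration conf) throws IOException {
        Path tempOutPath = new Path(outputPath.getParent(), "_tmp." + outputPath.getName());
        FileSystem fs = outputPath.getFileSystem(conf);
        if (!fs.exists(tempOutPath)) {
            fs.mkdirs(tempOutPath);
        }
        return tempOutPath;
    }
}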

Example 2 with MergeFileWork

Use of org.apache.hadoop.hive.ql.io.merge.MergeFileWork in project hive by apache.

From the class SparkPlanGenerator, method cloneJobConf.

@SuppressWarnings({ "unchecked" })
private JobConf cloneJobConf(BaseWork work) throws Exception {
    if (workToJobConf.containsKey(work)) {
        return workToJobConf.get(work);
    }
    JobConf cloned = new JobConf(jobConf);
    // Make sure we'll use a different plan path from the original one
    HiveConf.setVar(cloned, HiveConf.ConfVars.PLAN, "");
    try {
        cloned.setPartitionerClass(JavaUtils.loadClass(HiveConf.getVar(cloned, HiveConf.ConfVars.HIVEPARTITIONER)));
    } catch (ClassNotFoundException e) {
        String msg = "Could not find partitioner class: " + e.getMessage() + " which is specified by: " + HiveConf.ConfVars.HIVEPARTITIONER.varname;
        throw new IllegalArgumentException(msg, e);
    }
    if (work instanceof MapWork) {
        cloned.setBoolean("mapred.task.is.map", true);
        List<Path> inputPaths = Utilities.getInputPaths(cloned, (MapWork) work, scratchDir, context, false);
        Utilities.setInputPaths(cloned, inputPaths);
        Utilities.setMapWork(cloned, (MapWork) work, scratchDir, false);
        Utilities.createTmpDirs(cloned, (MapWork) work);
        if (work instanceof MergeFileWork) {
            MergeFileWork mergeFileWork = (MergeFileWork) work;
            cloned.set(Utilities.MAPRED_MAPPER_CLASS, MergeFileMapper.class.getName());
            cloned.set("mapred.input.format.class", mergeFileWork.getInputformat());
            cloned.setClass("mapred.output.format.class", MergeFileOutputFormat.class, FileOutputFormat.class);
        } else {
            cloned.set(Utilities.MAPRED_MAPPER_CLASS, ExecMapper.class.getName());
        }
        if (((MapWork) work).getMinSplitSize() != null) {
            HiveConf.setLongVar(cloned, HiveConf.ConfVars.MAPREDMINSPLITSIZE, ((MapWork) work).getMinSplitSize());
        }
        // remember the JobConf cloned for each MapWork, so we won't clone for it again
        workToJobConf.put(work, cloned);
    } else if (work instanceof ReduceWork) {
        cloned.setBoolean("mapred.task.is.map", false);
        Utilities.setReduceWork(cloned, (ReduceWork) work, scratchDir, false);
        Utilities.createTmpDirs(cloned, (ReduceWork) work);
        cloned.set(Utilities.MAPRED_REDUCER_CLASS, ExecReducer.class.getName());
    }
    return cloned;
}
Also used : Path(org.apache.hadoop.fs.Path) MergeFileWork(org.apache.hadoop.hive.ql.io.merge.MergeFileWork) MergeFileMapper(org.apache.hadoop.hive.ql.io.merge.MergeFileMapper) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) ReduceWork(org.apache.hadoop.hive.ql.plan.ReduceWork) JobConf(org.apache.hadoop.mapred.JobConf) ExecMapper(org.apache.hadoop.hive.ql.exec.mr.ExecMapper)
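
Two details of cloneJobConf are worth calling out: the clone is remembered per BaseWork so later lookups reuse the same JobConf, and for a MergeFileWork the mapper plus input/output formats are swapped for their merge-specific counterparts. A hypothetical, self-contained sketch of the caching part (class and method names are illustrative):

import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.mapred.JobConf;

public class JobConfCache {

    private final JobConf baseConf;
    private final Map<Object, JobConf> workToJobConf = new HashMap<>();

    public JobConfCache(JobConf baseConf) {
        this.baseConf = baseConf;
    }

    // Clone the base JobConf at most once per work object; later calls for the
    // same work return the cached clone, mirroring the workToJobConf map above.
    public JobConf confFor(Object work, boolean isMapSide) {
        return workToJobConf.computeIfAbsent(work, w -> {
            JobConf cloned = new JobConf(baseConf);        // copy; the original stays untouched
            cloned.setBoolean("mapred.task.is.map", isMapSide);
            return cloned;
        });
    }
}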

Example 3 with MergeFileWork

Use of org.apache.hadoop.hive.ql.io.merge.MergeFileWork in project hive by apache.

From the class GenMapRedUtils, method createMergeTask.

/**
   * Create a block-level merge task for RCFile tables or a stripe-level merge
   * task for ORC tables.
   *
   * @param fsInputDesc the file sink whose output files are to be merged
   * @param finalName the final output directory of the merge
   * @param hasDynamicPartitions whether the file sink uses dynamic partitioning
   * @param ctx compilation context used to create the merge operator
   * @return a MergeFileWork (a MapWork) if the table is stored as RCFile or ORCFile;
   *         a SemanticException is thrown for any other file format
   */
public static MapWork createMergeTask(FileSinkDesc fsInputDesc, Path finalName, boolean hasDynamicPartitions, CompilationOpContext ctx) throws SemanticException {
    Path inputDir = fsInputDesc.getFinalDirName();
    TableDesc tblDesc = fsInputDesc.getTableInfo();
    List<Path> inputDirs = new ArrayList<Path>(1);
    ArrayList<String> inputDirstr = new ArrayList<String>(1);
    // in case of dynamic partitioning and list bucketing
    if (!hasDynamicPartitions && !GenMapRedUtils.isSkewedStoredAsDirs(fsInputDesc)) {
        inputDirs.add(inputDir);
    }
    inputDirstr.add(inputDir.toString());
    // internal input format class for CombineHiveInputFormat
    final Class<? extends InputFormat> internalIFClass;
    if (tblDesc.getInputFileFormatClass().equals(RCFileInputFormat.class)) {
        internalIFClass = RCFileBlockMergeInputFormat.class;
    } else if (tblDesc.getInputFileFormatClass().equals(OrcInputFormat.class)) {
        internalIFClass = OrcFileStripeMergeInputFormat.class;
    } else {
        throw new SemanticException("createMergeTask called on a table with file" + " format other than RCFile or ORCFile");
    }
    // create the merge file work
    MergeFileWork work = new MergeFileWork(inputDirs, finalName, hasDynamicPartitions, tblDesc.getInputFileFormatClass().getName());
    LinkedHashMap<Path, ArrayList<String>> pathToAliases = new LinkedHashMap<>();
    pathToAliases.put(inputDir, inputDirstr);
    work.setMapperCannotSpanPartns(true);
    work.setPathToAliases(pathToAliases);
    PartitionDesc pDesc = new PartitionDesc(tblDesc, null);
    pDesc.setInputFileFormatClass(internalIFClass);
    work.addPathToPartitionInfo(inputDir, pDesc);
    work.setListBucketingCtx(fsInputDesc.getLbCtx());
    // create alias to work which contains the merge operator
    LinkedHashMap<String, Operator<? extends OperatorDesc>> aliasToWork = new LinkedHashMap<String, Operator<? extends OperatorDesc>>();
    Operator<? extends OperatorDesc> mergeOp = null;
    final FileMergeDesc fmd;
    if (tblDesc.getInputFileFormatClass().equals(RCFileInputFormat.class)) {
        fmd = new RCFileMergeDesc();
    } else {
        fmd = new OrcFileMergeDesc();
    }
    fmd.setDpCtx(fsInputDesc.getDynPartCtx());
    fmd.setOutputPath(finalName);
    fmd.setHasDynamicPartitions(work.hasDynamicPartitions());
    fmd.setListBucketingAlterTableConcatenate(work.isListBucketingAlterTableConcatenate());
    int lbLevel = work.getListBucketingCtx() == null ? 0 : work.getListBucketingCtx().calculateListBucketingLevel();
    fmd.setListBucketingDepth(lbLevel);
    mergeOp = OperatorFactory.get(ctx, fmd);
    aliasToWork.put(inputDir.toString(), mergeOp);
    work.setAliasToWork(aliasToWork);
    return work;
}
Also used : Path(org.apache.hadoop.fs.Path) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) DemuxOperator(org.apache.hadoop.hive.ql.exec.DemuxOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) UnionOperator(org.apache.hadoop.hive.ql.exec.UnionOperator) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) SMBMapJoinOperator(org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator) MergeFileWork(org.apache.hadoop.hive.ql.io.merge.MergeFileWork) RCFileMergeDesc(org.apache.hadoop.hive.ql.plan.RCFileMergeDesc) OrcFileMergeDesc(org.apache.hadoop.hive.ql.plan.OrcFileMergeDesc) FileMergeDesc(org.apache.hadoop.hive.ql.plan.FileMergeDesc) RCFileMergeDesc(org.apache.hadoop.hive.ql.plan.RCFileMergeDesc) OrcFileMergeDesc(org.apache.hadoop.hive.ql.plan.OrcFileMergeDesc) ArrayList(java.util.ArrayList) OrcFileStripeMergeInputFormat(org.apache.hadoop.hive.ql.io.orc.OrcFileStripeMergeInputFormat) LinkedHashMap(java.util.LinkedHashMap) OrcInputFormat(org.apache.hadoop.hive.ql.io.orc.OrcInputFormat) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc) LoadTableDesc(org.apache.hadoop.hive.ql.plan.LoadTableDesc) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc) SemanticException(org.apache.hadoop.hive.ql.parse.SemanticException)
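
Stripped of the descriptor plumbing, the wiring createMergeTask performs for a single non-partitioned, non-skewed ORC directory looks roughly like the sketch below. The paths are placeholders, the class and method names are illustrative, and only calls already shown in the method above are used.

import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.merge.MergeFileWork;
import org.apache.hadoop.hive.ql.io.orc.OrcInputFormat;

public class MergeWorkWiring {

    static MergeFileWork minimalOrcMergeWork() {
        Path inputDir = new Path("/tmp/hive-staging/-ext-10002");   // placeholder staging dir
        Path finalName = new Path("/warehouse/t");                  // placeholder final dir

        List<Path> inputDirs = new ArrayList<>();
        inputDirs.add(inputDir);

        // Same constructor as above: input dirs, final output dir, dynamic-partition
        // flag, and the source table's input format class name.
        MergeFileWork work = new MergeFileWork(inputDirs, finalName,
            false /* hasDynamicPartitions */, OrcInputFormat.class.getName());

        // One alias per input directory; the alias later maps to the merge operator.
        LinkedHashMap<Path, ArrayList<String>> pathToAliases = new LinkedHashMap<>();
        ArrayList<String> aliases = new ArrayList<>();
        aliases.add(inputDir.toString());
        pathToAliases.put(inputDir, aliases);

        work.setMapperCannotSpanPartns(true);
        work.setPathToAliases(pathToAliases);
        return work;
    }
}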

Example 4 with MergeFileWork

Use of org.apache.hadoop.hive.ql.io.merge.MergeFileWork in project hive by apache.

From the class DDLTask, method mergeFiles.

/**
   * First, make sure the source table/partition is not archived, does not have
   * indexes, and is stored in a supported format (RCFile or ORC). If any of
   * these checks fails, an exception is thrown.
   *
   * The merge itself is done by building a MergeFileWork from the
   * mergeFilesDesc and executing it as a MergeFileTask (or as a TezTask
   * wrapping the merge work when Tez is the execution engine).
   *
   * @param db the Hive database handle
   * @param mergeFilesDesc descriptor carrying the input/output directories and input format
   * @param driverContext driver context providing the compilation context for the merge operator
   * @return the return code of the executed merge task (0 on success)
   * @throws HiveException
   */
private int mergeFiles(Hive db, AlterTablePartMergeFilesDesc mergeFilesDesc, DriverContext driverContext) throws HiveException {
    ListBucketingCtx lbCtx = mergeFilesDesc.getLbCtx();
    boolean lbatc = lbCtx == null ? false : lbCtx.isSkewedStoredAsDir();
    int lbd = lbCtx == null ? 0 : lbCtx.calculateListBucketingLevel();
    // merge work only needs input and output.
    MergeFileWork mergeWork = new MergeFileWork(mergeFilesDesc.getInputDir(), mergeFilesDesc.getOutputDir(), mergeFilesDesc.getInputFormatClass().getName());
    LinkedHashMap<Path, ArrayList<String>> pathToAliases = new LinkedHashMap<>();
    ArrayList<String> inputDirstr = new ArrayList<String>(1);
    inputDirstr.add(mergeFilesDesc.getInputDir().toString());
    pathToAliases.put(mergeFilesDesc.getInputDir().get(0), inputDirstr);
    mergeWork.setPathToAliases(pathToAliases);
    mergeWork.setListBucketingCtx(mergeFilesDesc.getLbCtx());
    mergeWork.resolveConcatenateMerge(db.getConf());
    mergeWork.setMapperCannotSpanPartns(true);
    mergeWork.setSourceTableInputFormat(mergeFilesDesc.getInputFormatClass().getName());
    final FileMergeDesc fmd;
    if (mergeFilesDesc.getInputFormatClass().equals(RCFileInputFormat.class)) {
        fmd = new RCFileMergeDesc();
    } else {
        // safe to assume else is ORC as semantic analyzer will check for RC/ORC
        fmd = new OrcFileMergeDesc();
    }
    fmd.setDpCtx(null);
    fmd.setHasDynamicPartitions(false);
    fmd.setListBucketingAlterTableConcatenate(lbatc);
    fmd.setListBucketingDepth(lbd);
    fmd.setOutputPath(mergeFilesDesc.getOutputDir());
    CompilationOpContext opContext = driverContext.getCtx().getOpContext();
    Operator<? extends OperatorDesc> mergeOp = OperatorFactory.get(opContext, fmd);
    LinkedHashMap<String, Operator<? extends OperatorDesc>> aliasToWork = new LinkedHashMap<String, Operator<? extends OperatorDesc>>();
    aliasToWork.put(mergeFilesDesc.getInputDir().toString(), mergeOp);
    mergeWork.setAliasToWork(aliasToWork);
    DriverContext driverCxt = new DriverContext();
    Task task;
    if (conf.getVar(ConfVars.HIVE_EXECUTION_ENGINE).equals("tez")) {
        TezWork tezWork = new TezWork(conf.getVar(HiveConf.ConfVars.HIVEQUERYID), conf);
        mergeWork.setName("File Merge");
        tezWork.add(mergeWork);
        task = new TezTask();
        task.setWork(tezWork);
    } else {
        task = new MergeFileTask();
        task.setWork(mergeWork);
    }
    // initialize the task and execute
    task.initialize(queryState, getQueryPlan(), driverCxt, opContext);
    subtask = task;
    int ret = task.execute(driverCxt);
    if (subtask.getException() != null) {
        setException(subtask.getException());
    }
    return ret;
}
Also used : Path(org.apache.hadoop.fs.Path) DriverContext(org.apache.hadoop.hive.ql.DriverContext) MergeFileWork(org.apache.hadoop.hive.ql.io.merge.MergeFileWork) RCFileMergeDesc(org.apache.hadoop.hive.ql.plan.RCFileMergeDesc) ColumnTruncateTask(org.apache.hadoop.hive.ql.io.rcfile.truncate.ColumnTruncateTask) TezTask(org.apache.hadoop.hive.ql.exec.tez.TezTask) MergeFileTask(org.apache.hadoop.hive.ql.io.merge.MergeFileTask) OrcFileMergeDesc(org.apache.hadoop.hive.ql.plan.OrcFileMergeDesc) FileMergeDesc(org.apache.hadoop.hive.ql.plan.FileMergeDesc) RCFileMergeDesc(org.apache.hadoop.hive.ql.plan.RCFileMergeDesc) OrcFileMergeDesc(org.apache.hadoop.hive.ql.plan.OrcFileMergeDesc) ArrayList(java.util.ArrayList) TezTask(org.apache.hadoop.hive.ql.exec.tez.TezTask) LinkedHashMap(java.util.LinkedHashMap) CompilationOpContext(org.apache.hadoop.hive.ql.CompilationOpContext) ListBucketingCtx(org.apache.hadoop.hive.ql.plan.ListBucketingCtx) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc) TezWork(org.apache.hadoop.hive.ql.plan.TezWork) MergeFileTask(org.apache.hadoop.hive.ql.io.merge.MergeFileTask)
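
The choice of merge descriptor follows the source format: RCFile tables get an RCFileMergeDesc, everything else is assumed to be ORC because the semantic analyzer only admits RCFile and ORC for concatenation. A hypothetical helper condensing that branch (class and method names are illustrative; the setter calls are the ones used above):

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.RCFileInputFormat;
import org.apache.hadoop.hive.ql.plan.FileMergeDesc;
import org.apache.hadoop.hive.ql.plan.OrcFileMergeDesc;
import org.apache.hadoop.hive.ql.plan.RCFileMergeDesc;

public class MergeDescSelector {

    // Pick the concrete FileMergeDesc for ALTER TABLE ... CONCATENATE:
    // RCFile gets block-level merging, ORC gets stripe-level merging.
    static FileMergeDesc mergeDescFor(Class<?> inputFormatClass, Path outputDir) {
        FileMergeDesc fmd = RCFileInputFormat.class.equals(inputFormatClass)
            ? new RCFileMergeDesc()
            : new OrcFileMergeDesc();
        fmd.setDpCtx(null);                 // concatenation does not use dynamic partitions
        fmd.setHasDynamicPartitions(false);
        fmd.setOutputPath(outputDir);
        return fmd;
    }
}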

Example 5 with MergeFileWork

Use of org.apache.hadoop.hive.ql.io.merge.MergeFileWork in project hive by apache.

From the class MergeFileRecordProcessor, method init.

@Override
void init(MRTaskReporter mrReporter, Map<String, LogicalInput> inputs, Map<String, LogicalOutput> outputs) throws Exception {
    // TODO HIVE-14042. Abort handling.
    perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.TEZ_INIT_OPERATORS);
    super.init(mrReporter, inputs, outputs);
    execContext = new ExecMapperContext(jconf);
    // Update the JobConf using MRInput; info like the input file name comes via this
    mrInput = getMRInput(inputs);
    Configuration updatedConf = mrInput.getConfigUpdates();
    if (updatedConf != null) {
        for (Map.Entry<String, String> entry : updatedConf) {
            jconf.set(entry.getKey(), entry.getValue());
        }
    }
    createOutputMap();
    // Start all the Outputs.
    for (Map.Entry<String, LogicalOutput> outputEntry : outputs.entrySet()) {
        outputEntry.getValue().start();
        ((TezProcessor.TezKVOutputCollector) outMap.get(outputEntry.getKey())).initialize();
    }
    String queryId = HiveConf.getVar(jconf, HiveConf.ConfVars.HIVEQUERYID);
    cache = ObjectCacheFactory.getCache(jconf, queryId, true);
    try {
        execContext.setJc(jconf);
        cacheKey = MAP_PLAN_KEY;
        MapWork mapWork = (MapWork) cache.retrieve(cacheKey, new Callable<Object>() {

            @Override
            public Object call() {
                return Utilities.getMapWork(jconf);
            }
        });
        Utilities.setMapWork(jconf, mapWork);
        if (mapWork instanceof MergeFileWork) {
            mfWork = (MergeFileWork) mapWork;
        } else {
            throw new RuntimeException("MapWork should be an instance of MergeFileWork.");
        }
        String alias = mfWork.getAliasToWork().keySet().iterator().next();
        mergeOp = mfWork.getAliasToWork().get(alias);
        LOG.info(mergeOp.dump(0));
        MapredContext.init(true, new JobConf(jconf));
        ((TezContext) MapredContext.get()).setInputs(inputs);
        mergeOp.passExecContext(execContext);
        mergeOp.initializeLocalWork(jconf);
        mergeOp.initialize(jconf, null);
        OperatorUtils.setChildrenCollector(mergeOp.getChildOperators(), outMap);
        mergeOp.setReporter(reporter);
        MapredContext.get().setReporter(reporter);
    } catch (Throwable e) {
        if (e instanceof OutOfMemoryError) {
            // Don't create a new object if we are already out of memory
            throw (OutOfMemoryError) e;
        } else if (e instanceof InterruptedException) {
            l4j.info("Hit an interrupt while initializing MergeFileRecordProcessor. Message={}", e.getMessage());
            throw (InterruptedException) e;
        } else {
            throw new RuntimeException("Map operator initialization failed", e);
        }
    }
    perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.TEZ_INIT_OPERATORS);
}
Also used : ExecMapperContext(org.apache.hadoop.hive.ql.exec.mr.ExecMapperContext) MergeFileWork(org.apache.hadoop.hive.ql.io.merge.MergeFileWork) Configuration(org.apache.hadoop.conf.Configuration) LogicalOutput(org.apache.tez.runtime.api.LogicalOutput) Callable(java.util.concurrent.Callable) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) Map(java.util.Map) JobConf(org.apache.hadoop.mapred.JobConf)
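
The plan lookup at the heart of init uses Hive's per-query object cache: the MapWork is deserialized from the JobConf only on a cache miss and reused under the same key afterwards. A condensed, hypothetical sketch of that lookup; the class and method names are illustrative, and the declared cache type is assumed to be org.apache.hadoop.hive.ql.exec.ObjectCache as returned by ObjectCacheFactory.getCache.

import java.util.concurrent.Callable;

import org.apache.hadoop.hive.ql.exec.ObjectCache;
import org.apache.hadoop.hive.ql.exec.ObjectCacheFactory;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.plan.MapWork;
import org.apache.hadoop.mapred.JobConf;

public class MapWorkLookup {

    // Retrieve the MapWork from the per-query cache, deserializing it from the
    // job configuration only if no other task has cached it yet.
    static MapWork retrieveMapWork(final JobConf jconf, String queryId, String cacheKey)
            throws Exception {
        ObjectCache cache = ObjectCacheFactory.getCache(jconf, queryId, true);
        return (MapWork) cache.retrieve(cacheKey, new Callable<Object>() {
            @Override
            public Object call() {
                return Utilities.getMapWork(jconf);
            }
        });
    }
}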

Aggregations

MergeFileWork (org.apache.hadoop.hive.ql.io.merge.MergeFileWork): 8
Path (org.apache.hadoop.fs.Path): 5
MapWork (org.apache.hadoop.hive.ql.plan.MapWork): 4
JobConf (org.apache.hadoop.mapred.JobConf): 4
IOException (java.io.IOException): 2
ArrayList (java.util.ArrayList): 2
LinkedHashMap (java.util.LinkedHashMap): 2
FileSystem (org.apache.hadoop.fs.FileSystem): 2
ExecMapper (org.apache.hadoop.hive.ql.exec.mr.ExecMapper): 2
MergeFileMapper (org.apache.hadoop.hive.ql.io.merge.MergeFileMapper): 2
FileMergeDesc (org.apache.hadoop.hive.ql.plan.FileMergeDesc): 2
OperatorDesc (org.apache.hadoop.hive.ql.plan.OperatorDesc): 2
OrcFileMergeDesc (org.apache.hadoop.hive.ql.plan.OrcFileMergeDesc): 2
RCFileMergeDesc (org.apache.hadoop.hive.ql.plan.RCFileMergeDesc): 2
ReduceWork (org.apache.hadoop.hive.ql.plan.ReduceWork): 2
HashMap (java.util.HashMap): 1
Map (java.util.Map): 1
Callable (java.util.concurrent.Callable): 1
ConcurrentHashMap (java.util.concurrent.ConcurrentHashMap): 1
Configuration (org.apache.hadoop.conf.Configuration): 1