Search in sources :

Example 1 with MergeFileTask

Example usage of org.apache.hadoop.hive.ql.io.merge.MergeFileTask in the Apache Hive project.

From the class AlterTableConcatenateOperation, method getTask.

/**
 * Builds the task that will execute the given file-merge work, choosing the
 * task type based on the configured execution engine.
 *
 * @param mergeWork the file-merge work to run
 * @return a {@code TezTask} wrapping the work in a {@code TezWork} when the
 *         engine is "tez", otherwise a {@code MergeFileTask} running it directly
 */
private Task<?> getTask(MergeFileWork mergeWork) {
    if (context.getConf().getVar(ConfVars.HIVE_EXECUTION_ENGINE).equals("tez")) {
        TezWork tezWork = new TezWork(context.getQueryState().getQueryId(), context.getConf());
        mergeWork.setName("File Merge");
        tezWork.add(mergeWork);
        // Typed local avoids the declare-as-Task<?>-then-downcast pattern.
        TezTask task = new TezTask();
        task.setWork(tezWork);
        return task;
    } else {
        MergeFileTask task = new MergeFileTask();
        task.setWork(mergeWork);
        return task;
    }
}
Also used : TezTask(org.apache.hadoop.hive.ql.exec.tez.TezTask) TezWork(org.apache.hadoop.hive.ql.plan.TezWork) MergeFileTask(org.apache.hadoop.hive.ql.io.merge.MergeFileTask)

Example 2 with MergeFileTask

Example usage of org.apache.hadoop.hive.ql.io.merge.MergeFileTask in the Apache Hive project.

From the class DDLTask, method mergeFiles.

/**
 * Merges the files of a table/partition into larger files.
 *
 * The source table/partition must not be archived, indexed, or stored in a
 * format other than RCFile/ORC; the semantic analyzer enforces this before
 * the task reaches this point.
 *
 * The merge is performed by building a {@link MergeFileWork} from the
 * descriptor and executing it either as a {@code TezTask} (when the
 * execution engine is "tez") or as a {@code MergeFileTask}.
 *
 * @param db the Hive session object
 * @param mergeFilesDesc describes the input/output directories, input
 *        format, and list-bucketing context for the merge
 * @param driverContext compilation-time context; supplies the operator
 *        context used to build the merge operator
 * @return the exit code of the executed merge task (0 on success)
 * @throws HiveException if setting up or resolving the merge work fails
 */
private int mergeFiles(Hive db, AlterTablePartMergeFilesDesc mergeFilesDesc, DriverContext driverContext) throws HiveException {
    ListBucketingCtx lbCtx = mergeFilesDesc.getLbCtx();
    boolean lbatc = lbCtx == null ? false : lbCtx.isSkewedStoredAsDir();
    int lbd = lbCtx == null ? 0 : lbCtx.calculateListBucketingLevel();
    // Merge work only needs the input and output locations plus the format.
    MergeFileWork mergeWork = new MergeFileWork(mergeFilesDesc.getInputDir(), mergeFilesDesc.getOutputDir(), mergeFilesDesc.getInputFormatClass().getName(), mergeFilesDesc.getTableDesc());
    LinkedHashMap<Path, ArrayList<String>> pathToAliases = new LinkedHashMap<>();
    ArrayList<String> inputDirstr = new ArrayList<String>(1);
    // The alias string is the toString() of the whole input-dir list; the
    // path key is its first element. Both sides use the same alias below.
    inputDirstr.add(mergeFilesDesc.getInputDir().toString());
    pathToAliases.put(mergeFilesDesc.getInputDir().get(0), inputDirstr);
    mergeWork.setPathToAliases(pathToAliases);
    mergeWork.setListBucketingCtx(mergeFilesDesc.getLbCtx());
    mergeWork.resolveConcatenateMerge(db.getConf());
    mergeWork.setMapperCannotSpanPartns(true);
    mergeWork.setSourceTableInputFormat(mergeFilesDesc.getInputFormatClass().getName());
    final FileMergeDesc fmd;
    if (mergeFilesDesc.getInputFormatClass().equals(RCFileInputFormat.class)) {
        fmd = new RCFileMergeDesc();
    } else {
        // Safe to assume ORC here: the semantic analyzer only admits RC/ORC.
        fmd = new OrcFileMergeDesc();
    }
    // Concatenate never runs with dynamic partitions; only list bucketing matters.
    fmd.setDpCtx(null);
    fmd.setHasDynamicPartitions(false);
    fmd.setListBucketingAlterTableConcatenate(lbatc);
    fmd.setListBucketingDepth(lbd);
    fmd.setOutputPath(mergeFilesDesc.getOutputDir());
    CompilationOpContext opContext = driverContext.getCtx().getOpContext();
    Operator<? extends OperatorDesc> mergeOp = OperatorFactory.get(opContext, fmd);
    LinkedHashMap<String, Operator<? extends OperatorDesc>> aliasToWork = new LinkedHashMap<>();
    aliasToWork.put(mergeFilesDesc.getInputDir().toString(), mergeOp);
    mergeWork.setAliasToWork(aliasToWork);
    // NOTE(review): a fresh DriverContext is used for execution even though a
    // driverContext parameter exists; presumably intentional so the subtask
    // runs with a clean context — confirm before changing.
    DriverContext driverCxt = new DriverContext();
    Task<?> task;
    if (conf.getVar(ConfVars.HIVE_EXECUTION_ENGINE).equals("tez")) {
        TezWork tezWork = new TezWork(queryState.getQueryId(), conf);
        mergeWork.setName("File Merge");
        tezWork.add(mergeWork);
        // Typed local avoids the declare-then-downcast pattern.
        TezTask tezTask = new TezTask();
        tezTask.setWork(tezWork);
        task = tezTask;
    } else {
        MergeFileTask mergeTask = new MergeFileTask();
        mergeTask.setWork(mergeWork);
        task = mergeTask;
    }
    // Initialize the task and execute it synchronously.
    task.initialize(queryState, getQueryPlan(), driverCxt, opContext);
    subtask = task;
    int ret = task.execute(driverCxt);
    // Surface any failure from the subtask on this DDLTask.
    if (subtask.getException() != null) {
        setException(subtask.getException());
    }
    return ret;
}
Also used : Path(org.apache.hadoop.fs.Path) DriverContext(org.apache.hadoop.hive.ql.DriverContext) MergeFileWork(org.apache.hadoop.hive.ql.io.merge.MergeFileWork) RCFileMergeDesc(org.apache.hadoop.hive.ql.plan.RCFileMergeDesc) OrcFileMergeDesc(org.apache.hadoop.hive.ql.plan.OrcFileMergeDesc) FileMergeDesc(org.apache.hadoop.hive.ql.plan.FileMergeDesc) RCFileMergeDesc(org.apache.hadoop.hive.ql.plan.RCFileMergeDesc) OrcFileMergeDesc(org.apache.hadoop.hive.ql.plan.OrcFileMergeDesc) ArrayList(java.util.ArrayList) TezTask(org.apache.hadoop.hive.ql.exec.tez.TezTask) SQLUniqueConstraint(org.apache.hadoop.hive.metastore.api.SQLUniqueConstraint) CheckConstraint(org.apache.hadoop.hive.ql.metadata.CheckConstraint) NotNullConstraint(org.apache.hadoop.hive.ql.metadata.NotNullConstraint) SQLCheckConstraint(org.apache.hadoop.hive.metastore.api.SQLCheckConstraint) SQLDefaultConstraint(org.apache.hadoop.hive.metastore.api.SQLDefaultConstraint) DefaultConstraint(org.apache.hadoop.hive.ql.metadata.DefaultConstraint) UniqueConstraint(org.apache.hadoop.hive.ql.metadata.UniqueConstraint) SQLNotNullConstraint(org.apache.hadoop.hive.metastore.api.SQLNotNullConstraint) LinkedHashMap(java.util.LinkedHashMap) CompilationOpContext(org.apache.hadoop.hive.ql.CompilationOpContext) ListBucketingCtx(org.apache.hadoop.hive.ql.plan.ListBucketingCtx) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc) TezWork(org.apache.hadoop.hive.ql.plan.TezWork) MergeFileTask(org.apache.hadoop.hive.ql.io.merge.MergeFileTask)

Example 3 with MergeFileTask

Example usage of org.apache.hadoop.hive.ql.io.merge.MergeFileTask in the Apache Hive project.

From the class MoveTask, method inferTaskInformation.

/**
 * Walks up the single-parent task chain to infer bucketing/sorting metadata
 * for the data this move task is moving.
 *
 * Mutates the given {@code ti}: may update {@code ti.task}, {@code ti.path},
 * {@code ti.bucketCols}, {@code ti.sortCols}, and {@code ti.numBuckets}.
 * The walk stops when a task has zero or multiple parents, or when a
 * merge/local/map-reduce ancestor is reached.
 *
 * @param ti accumulator holding the current task, path, and inferred info
 */
private void inferTaskInformation(TaskInformation ti) {
    // Follow the chain only while it is unambiguous: exactly one parent.
    // (The producing task may be a standard map-reduce, a local one, or a merge.)
    while (ti.task.getParentTasks() != null && ti.task.getParentTasks().size() == 1) {
        ti.task = (Task) ti.task.getParentTasks().get(0);
        // If it was a merge task or a local map reduce task, nothing can be inferred
        if (ti.task instanceof MergeFileTask || ti.task instanceof MapredLocalTask) {
            break;
        }
        // A MapRedTask ancestor can tell us how it bucketed/sorted
        // the directory this move task is moving.
        if (ti.task instanceof MapRedTask) {
            MapredWork work = (MapredWork) ti.task.getWork();
            MapWork mapWork = work.getMapWork();
            ti.bucketCols = mapWork.getBucketedColsByDirectory().get(ti.path);
            ti.sortCols = mapWork.getSortedColsByDirectory().get(ti.path);
            if (work.getReduceWork() != null) {
                ti.numBuckets = work.getReduceWork().getNumReduceTasks();
            }
            if (ti.bucketCols != null || ti.sortCols != null) {
                // Inferred metadata is only trustworthy for the final map-reduce
                // stage (the one whose file sink writes the final output).
                assert work.isFinalMapRed();
            }
            break;
        }
        // An intermediate MoveTask appears when the merge
        // condition for merging is not met, see GenMRFileSink1.
        // Re-anchor the path to that move's source and keep walking.
        if (ti.task instanceof MoveTask) {
            MoveTask mt = (MoveTask) ti.task;
            if (mt.getWork().getLoadFileWork() != null) {
                ti.path = mt.getWork().getLoadFileWork().getSourcePath().toUri().toString();
            }
        }
    }
}
Also used : MapRedTask(org.apache.hadoop.hive.ql.exec.mr.MapRedTask) MapredWork(org.apache.hadoop.hive.ql.plan.MapredWork) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) MapredLocalTask(org.apache.hadoop.hive.ql.exec.mr.MapredLocalTask) MergeFileTask(org.apache.hadoop.hive.ql.io.merge.MergeFileTask)

Aggregations

MergeFileTask (org.apache.hadoop.hive.ql.io.merge.MergeFileTask)3 TezTask (org.apache.hadoop.hive.ql.exec.tez.TezTask)2 TezWork (org.apache.hadoop.hive.ql.plan.TezWork)2 ArrayList (java.util.ArrayList)1 LinkedHashMap (java.util.LinkedHashMap)1 Path (org.apache.hadoop.fs.Path)1 SQLCheckConstraint (org.apache.hadoop.hive.metastore.api.SQLCheckConstraint)1 SQLDefaultConstraint (org.apache.hadoop.hive.metastore.api.SQLDefaultConstraint)1 SQLNotNullConstraint (org.apache.hadoop.hive.metastore.api.SQLNotNullConstraint)1 SQLUniqueConstraint (org.apache.hadoop.hive.metastore.api.SQLUniqueConstraint)1 CompilationOpContext (org.apache.hadoop.hive.ql.CompilationOpContext)1 DriverContext (org.apache.hadoop.hive.ql.DriverContext)1 MapRedTask (org.apache.hadoop.hive.ql.exec.mr.MapRedTask)1 MapredLocalTask (org.apache.hadoop.hive.ql.exec.mr.MapredLocalTask)1 MergeFileWork (org.apache.hadoop.hive.ql.io.merge.MergeFileWork)1 CheckConstraint (org.apache.hadoop.hive.ql.metadata.CheckConstraint)1 DefaultConstraint (org.apache.hadoop.hive.ql.metadata.DefaultConstraint)1 NotNullConstraint (org.apache.hadoop.hive.ql.metadata.NotNullConstraint)1 UniqueConstraint (org.apache.hadoop.hive.ql.metadata.UniqueConstraint)1 FileMergeDesc (org.apache.hadoop.hive.ql.plan.FileMergeDesc)1