
Example 6 with MoveWork

use of org.apache.hadoop.hive.ql.plan.MoveWork in project hive by apache.

the class ImportSemanticAnalyzer method loadTable.

private static Task<?> loadTable(URI fromURI, Table table, boolean replace, Path tgtPath, ReplicationSpec replicationSpec, EximUtil.SemanticAnalyzerWrapperContext x, Long writeId, int stmtId, boolean isSourceMm) {
    Path dataPath = new Path(fromURI.toString(), EximUtil.DATA_PATH_NAME);
    Path destPath = null, loadPath = null;
    LoadFileType lft;
    if (AcidUtils.isInsertOnlyTable(table)) {
        String mmSubdir = replace ? AcidUtils.baseDir(writeId) : AcidUtils.deltaSubdir(writeId, writeId, stmtId);
        destPath = new Path(tgtPath, mmSubdir);
        loadPath = tgtPath;
        lft = LoadFileType.KEEP_EXISTING;
    } else {
        destPath = loadPath = x.getCtx().getExternalTmpPath(tgtPath);
        lft = replace ? LoadFileType.REPLACE_ALL : LoadFileType.OVERWRITE_EXISTING;
    }
    if (Utilities.FILE_OP_LOGGER.isTraceEnabled()) {
        Utilities.FILE_OP_LOGGER.trace("adding import work for table with source location: " + dataPath + "; table: " + tgtPath + "; copy destination " + destPath + "; mm " + writeId + " (src " + isSourceMm + ") for " + (table == null ? "a new table" : table.getTableName()));
    }
    Task<?> copyTask = null;
    if (replicationSpec.isInReplicationScope()) {
        if (isSourceMm || isAcid(writeId)) {
            // Note: this is replication gap, not MM gap... Repl V2 is not ready yet.
            throw new RuntimeException("Replicating MM and ACID tables is not supported");
        }
        copyTask = ReplCopyTask.getLoadCopyTask(replicationSpec, dataPath, destPath, x.getConf());
    } else {
        CopyWork cw = new CopyWork(dataPath, destPath, false);
        cw.setSkipSourceMmDirs(isSourceMm);
        copyTask = TaskFactory.get(cw);
    }
    LoadTableDesc loadTableWork = new LoadTableDesc(loadPath, Utilities.getTableDesc(table), new TreeMap<>(), lft, writeId);
    loadTableWork.setStmtId(stmtId);
    MoveWork mv = new MoveWork(x.getInputs(), x.getOutputs(), loadTableWork, null, false);
    Task<?> loadTableTask = TaskFactory.get(mv);
    copyTask.addDependentTask(loadTableTask);
    x.getTasks().add(copyTask);
    return loadTableTask;
}
Also used : Path(org.apache.hadoop.fs.Path) MoveWork(org.apache.hadoop.hive.ql.plan.MoveWork) LoadTableDesc(org.apache.hadoop.hive.ql.plan.LoadTableDesc) LoadFileType(org.apache.hadoop.hive.ql.plan.LoadTableDesc.LoadFileType) CopyWork(org.apache.hadoop.hive.ql.plan.CopyWork)
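
The core of this example is a two-step chain: a CopyTask stages the exported files, and a dependent MoveTask (driven by the MoveWork above) loads them into the table. Below is a minimal, hedged sketch of that chaining pattern in isolation; the helper name and its parameters are illustrative, and the inputs/outputs sets of the MoveWork are left null here (the real analyzer passes x.getInputs()/x.getOutputs()).

import java.util.TreeMap;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.Task;
import org.apache.hadoop.hive.ql.exec.TaskFactory;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.metadata.Table;
import org.apache.hadoop.hive.ql.plan.CopyWork;
import org.apache.hadoop.hive.ql.plan.LoadTableDesc;
import org.apache.hadoop.hive.ql.plan.LoadTableDesc.LoadFileType;
import org.apache.hadoop.hive.ql.plan.MoveWork;

public class CopyThenMoveSketch {

    // Illustrative helper: stage files with a CopyTask, then load them with a dependent MoveTask.
    static Task<?> chainCopyThenMove(Path sourceDir, Path stagingDir, Table table, Long writeId) {
        // 1. Copy the source files into the staging directory.
        Task<?> copyTask = TaskFactory.get(new CopyWork(sourceDir, stagingDir, false));
        // 2. Describe how the staged files should be loaded into the (unpartitioned) table.
        LoadTableDesc loadDesc = new LoadTableDesc(stagingDir, Utilities.getTableDesc(table),
            new TreeMap<>(), LoadFileType.REPLACE_ALL, writeId);
        // 3. Wrap the load in a MoveWork; inputs/outputs are omitted (null) in this sketch.
        Task<?> moveTask = TaskFactory.get(new MoveWork(null, null, loadDesc, null, false));
        // 4. The move may only run once the copy has finished.
        copyTask.addDependentTask(moveTask);
        return copyTask;
    }
}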

Example 7 with MoveWork

use of org.apache.hadoop.hive.ql.plan.MoveWork in project hive by apache.

the class GenMapRedUtils method createMRWorkForMergingFiles.

/**
 * @param fsInput The FileSink operator.
 * @param finalName the final destination path the merge job should write to.
 * @param dependencyTask optional dependency-collection task that the generated move tasks are chained behind.
 * @param mvTasks move tasks already generated for the query.
 * @param conf the Hive configuration.
 * @param currTask the task that currently contains fsInput.
 * @param lineageState lineage information to update when paths are merged.
 * @throws SemanticException
 *
 * Create a Map-only merge job using CombineHiveInputFormat for all partitions with the
 * following operators:
 *          MR job J0:
 *          ...
 *          |
 *          v
 *          FileSinkOperator_1 (fsInput)
 *          |
 *          v
 *          Merge job J1:
 *          |
 *          v
 *          TableScan (using CombineHiveInputFormat) (tsMerge)
 *          |
 *          v
 *          FileSinkOperator (fsMerge)
 *
 *          Here the pathToPartitionInfo & pathToAlias will remain the same, which means the paths
 *          do not contain the dynamic partitions (their parent). So after the dynamic partitions are
 *          created (after the first job finishes and before the moveTask or ConditionalTask starts),
 *          we need to change the pathToPartitionInfo & pathToAlias to include the dynamic partition
 *          directories.
 */
public static void createMRWorkForMergingFiles(FileSinkOperator fsInput, Path finalName, DependencyCollectionTask dependencyTask, List<Task<MoveWork>> mvTasks, HiveConf conf, Task<? extends Serializable> currTask, LineageState lineageState) throws SemanticException {
    // 
    // 1. create the operator tree
    // 
    FileSinkDesc fsInputDesc = fsInput.getConf();
    if (Utilities.FILE_OP_LOGGER.isTraceEnabled()) {
        Utilities.FILE_OP_LOGGER.trace("Creating merge work from " + System.identityHashCode(fsInput) + " with write ID " + (fsInputDesc.isMmTable() ? fsInputDesc.getTableWriteId() : null) + " into " + finalName);
    }
    boolean isBlockMerge = (conf.getBoolVar(ConfVars.HIVEMERGERCFILEBLOCKLEVEL) && fsInputDesc.getTableInfo().getInputFileFormatClass().equals(RCFileInputFormat.class)) || (conf.getBoolVar(ConfVars.HIVEMERGEORCFILESTRIPELEVEL) && fsInputDesc.getTableInfo().getInputFileFormatClass().equals(OrcInputFormat.class));
    RowSchema inputRS = fsInput.getSchema();
    Long srcMmWriteId = fsInputDesc.isMmTable() ? fsInputDesc.getTableWriteId() : null;
    FileSinkDesc fsOutputDesc = null;
    TableScanOperator tsMerge = null;
    if (!isBlockMerge) {
        // Create a TableScan operator
        tsMerge = GenMapRedUtils.createTemporaryTableScanOperator(fsInput.getCompilationOpContext(), inputRS);
        // Create a FileSink operator
        TableDesc ts = (TableDesc) fsInputDesc.getTableInfo().clone();
        Path mergeDest = srcMmWriteId == null ? finalName : finalName.getParent();
        fsOutputDesc = new FileSinkDesc(mergeDest, ts, conf.getBoolVar(ConfVars.COMPRESSRESULT));
        fsOutputDesc.setMmWriteId(srcMmWriteId);
        fsOutputDesc.setIsMerge(true);
        // Create and attach the filesink for the merge.
        OperatorFactory.getAndMakeChild(fsOutputDesc, inputRS, tsMerge);
    }
    // If the input FileSinkOperator has dynamic partitioning enabled, the tsMerge input schema
    // needs to include the partition columns, and the fsOutput should have
    // a DynamicPartitionCtx to indicate that its output needs to be dynamically partitioned.
    DynamicPartitionCtx dpCtx = fsInputDesc.getDynPartCtx();
    if (dpCtx != null && dpCtx.getNumDPCols() > 0) {
        // adding DP ColumnInfo to the RowSchema signature
        ArrayList<ColumnInfo> signature = inputRS.getSignature();
        String tblAlias = fsInputDesc.getTableInfo().getTableName();
        for (String dpCol : dpCtx.getDPColNames()) {
            // All partition columns are treated as string-typed virtual columns.
            ColumnInfo colInfo = new ColumnInfo(dpCol, TypeInfoFactory.stringTypeInfo, tblAlias, true);
            signature.add(colInfo);
        }
        inputRS.setSignature(signature);
        if (!isBlockMerge) {
            // create another DynamicPartitionCtx, which has a different input-to-DP column mapping
            DynamicPartitionCtx dpCtx2 = new DynamicPartitionCtx(dpCtx);
            fsOutputDesc.setDynPartCtx(dpCtx2);
        }
        // update the FileSinkOperator to include partition columns
        usePartitionColumns(fsInputDesc.getTableInfo().getProperties(), dpCtx.getDPColNames());
    } else {
        // non-partitioned table
        fsInputDesc.getTableInfo().getProperties().remove(org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.META_TABLE_PARTITION_COLUMNS);
    }
    // 
    // 2. Constructing a conditional task consisting of a move task and a map reduce task
    // 
    Path inputDirName = fsInputDesc.getMergeInputDirName();
    MapWork cplan;
    Serializable work;
    if (isBlockMerge) {
        cplan = GenMapRedUtils.createMergeTask(fsInputDesc, finalName, dpCtx != null && dpCtx.getNumDPCols() > 0, fsInput.getCompilationOpContext());
        if (conf.getVar(ConfVars.HIVE_EXECUTION_ENGINE).equals("tez")) {
            work = new TezWork(conf.getVar(HiveConf.ConfVars.HIVEQUERYID), conf);
            cplan.setName("File Merge");
            ((TezWork) work).add(cplan);
        } else if (conf.getVar(ConfVars.HIVE_EXECUTION_ENGINE).equals("spark")) {
            work = new SparkWork(conf.getVar(HiveConf.ConfVars.HIVEQUERYID));
            cplan.setName("Spark Merge File Work");
            ((SparkWork) work).add(cplan);
        } else {
            work = cplan;
        }
    } else {
        cplan = createMRWorkForMergingFiles(conf, tsMerge, fsInputDesc);
        if (conf.getVar(ConfVars.HIVE_EXECUTION_ENGINE).equals("tez")) {
            work = new TezWork(conf.getVar(HiveConf.ConfVars.HIVEQUERYID), conf);
            cplan.setName("File Merge");
            ((TezWork) work).add(cplan);
        } else if (conf.getVar(ConfVars.HIVE_EXECUTION_ENGINE).equals("spark")) {
            work = new SparkWork(conf.getVar(HiveConf.ConfVars.HIVEQUERYID));
            cplan.setName("Spark Merge File Work");
            ((SparkWork) work).add(cplan);
        } else {
            work = new MapredWork();
            ((MapredWork) work).setMapWork(cplan);
        }
    }
    // use CombineHiveInputFormat for map-only merging
    cplan.setInputformat("org.apache.hadoop.hive.ql.io.CombineHiveInputFormat");
    // NOTE: we should gather stats in MR1 rather than MR2 at merge job since we don't
    // know if merge MR2 will be triggered at execution time
    MoveWork dummyMv = null;
    if (srcMmWriteId == null) {
        // Only create the MoveWork for a non-MM table; no action is needed for an MM table.
        dummyMv = new MoveWork(null, null, null, new LoadFileDesc(inputDirName, finalName, true, null, null, false), false);
    }
    // Use the original fsOp path here in case of MM - while the new FSOP merges files inside the
    // MM directory, the original MoveTask still commits based on the parent. Note that this path
    // can only be triggered for a merge that's part of insert for now; MM tables do not support
    // concatenate. Keeping the old logic for non-MM tables with temp directories and stuff.
    Path fsopPath = srcMmWriteId != null ? fsInputDesc.getFinalDirName() : finalName;
    Task<MoveWork> mvTask = GenMapRedUtils.findMoveTaskForFsopOutput(mvTasks, fsopPath, fsInputDesc.isMmTable());
    ConditionalTask cndTsk = GenMapRedUtils.createCondTask(conf, currTask, dummyMv, work, fsInputDesc.getMergeInputDirName(), finalName, mvTask, dependencyTask, lineageState);
    // keep the dynamic partition context in conditional task resolver context
    ConditionalResolverMergeFilesCtx mrCtx = (ConditionalResolverMergeFilesCtx) cndTsk.getResolverCtx();
    mrCtx.setDPCtx(fsInputDesc.getDynPartCtx());
    mrCtx.setLbCtx(fsInputDesc.getLbCtx());
}
Also used : Path(org.apache.hadoop.fs.Path) MoveWork(org.apache.hadoop.hive.ql.plan.MoveWork) RowSchema(org.apache.hadoop.hive.ql.exec.RowSchema) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Serializable(java.io.Serializable) LoadFileDesc(org.apache.hadoop.hive.ql.plan.LoadFileDesc) FileSinkDesc(org.apache.hadoop.hive.ql.plan.FileSinkDesc) DynamicPartitionCtx(org.apache.hadoop.hive.ql.plan.DynamicPartitionCtx) ColumnInfo(org.apache.hadoop.hive.ql.exec.ColumnInfo) ConditionalResolverMergeFilesCtx(org.apache.hadoop.hive.ql.plan.ConditionalResolverMergeFiles.ConditionalResolverMergeFilesCtx) SparkWork(org.apache.hadoop.hive.ql.plan.SparkWork) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) MapredWork(org.apache.hadoop.hive.ql.plan.MapredWork) RCFileInputFormat(org.apache.hadoop.hive.ql.io.RCFileInputFormat) OrcInputFormat(org.apache.hadoop.hive.ql.io.orc.OrcInputFormat) ConditionalTask(org.apache.hadoop.hive.ql.exec.ConditionalTask) LoadTableDesc(org.apache.hadoop.hive.ql.plan.LoadTableDesc) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc) TezWork(org.apache.hadoop.hive.ql.plan.TezWork)
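
The first branch point in this method is the isBlockMerge check: RCFile and ORC outputs can be merged at block/stripe level without re-reading rows, while everything else gets a generic TableScan -> FileSink merge job. The following small sketch pulls that predicate into a standalone helper for readability; the helper name is ours, but the configuration flags and calls are taken verbatim from the example.

import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.conf.HiveConf.ConfVars;
import org.apache.hadoop.hive.ql.io.RCFileInputFormat;
import org.apache.hadoop.hive.ql.io.orc.OrcInputFormat;
import org.apache.hadoop.hive.ql.plan.FileSinkDesc;

public class BlockMergeCheckSketch {

    // Illustrative helper mirroring the isBlockMerge decision above.
    static boolean canUseBlockLevelMerge(HiveConf conf, FileSinkDesc fsInputDesc) {
        Class<?> inputFormat = fsInputDesc.getTableInfo().getInputFileFormatClass();
        // RCFile block-level merge, if enabled and the sink writes RCFile.
        boolean rcFileBlockMerge = conf.getBoolVar(ConfVars.HIVEMERGERCFILEBLOCKLEVEL)
            && inputFormat.equals(RCFileInputFormat.class);
        // ORC stripe-level merge, if enabled and the sink writes ORC.
        boolean orcStripeMerge = conf.getBoolVar(ConfVars.HIVEMERGEORCFILESTRIPELEVEL)
            && inputFormat.equals(OrcInputFormat.class);
        return rcFileBlockMerge || orcStripeMerge;
    }
}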

Example 8 with MoveWork

use of org.apache.hadoop.hive.ql.plan.MoveWork in project hive by apache.

the class GenMapRedUtils method mergeMovePaths.

/**
 * Merges the given conditional input path and the linked MoveWork into a single MoveWork.
 * This is an optimization for blob store systems that avoids an unnecessary extra rename or copy.
 *
 * @param condInputPath A path that the ConditionalTask uses as input for its sub-tasks.
 * @param linkedMoveWork A MoveWork that the ConditionalTask uses to link to its sub-tasks.
 * @param lineageState A LineageState used to track how the source path changes.
 * @return A new MoveWork that uses the conditional input path as its source and keeps the linkedMoveWork's target.
 */
@VisibleForTesting
protected static MoveWork mergeMovePaths(Path condInputPath, MoveWork linkedMoveWork, LineageState lineageState) {
    MoveWork newWork = new MoveWork(linkedMoveWork);
    LoadFileDesc fileDesc = null;
    LoadTableDesc tableDesc = null;
    if (linkedMoveWork.getLoadFileWork() != null) {
        fileDesc = new LoadFileDesc(linkedMoveWork.getLoadFileWork());
        fileDesc.setSourcePath(condInputPath);
        lineageState.updateDirToOpMap(condInputPath, linkedMoveWork.getLoadFileWork().getSourcePath());
    } else if (linkedMoveWork.getLoadTableWork() != null) {
        tableDesc = new LoadTableDesc(linkedMoveWork.getLoadTableWork());
        tableDesc.setSourcePath(condInputPath);
        lineageState.updateDirToOpMap(condInputPath, linkedMoveWork.getLoadTableWork().getSourcePath());
    } else {
        throw new IllegalArgumentException("Merging a path with a MoveWork with multi-files work is not allowed.");
    }
    newWork.setLoadFileWork(fileDesc);
    newWork.setLoadTableWork(tableDesc);
    return newWork;
}
Also used : MoveWork(org.apache.hadoop.hive.ql.plan.MoveWork) LoadTableDesc(org.apache.hadoop.hive.ql.plan.LoadTableDesc) LoadFileDesc(org.apache.hadoop.hive.ql.plan.LoadFileDesc) VisibleForTesting(com.google.common.annotations.VisibleForTesting)
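
A hedged usage sketch of mergeMovePaths follows. Since the method is protected and marked @VisibleForTesting, a caller like this would have to live in the same package (for example a unit test); the staging directory name and the helper itself are illustrative only.

package org.apache.hadoop.hive.ql.optimizer;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.plan.LoadFileDesc;
import org.apache.hadoop.hive.ql.plan.MoveWork;
import org.apache.hadoop.hive.ql.session.LineageState;

public class MergeMovePathsSketch {

    // Illustrative: collapse "conditional input dir -> intermediate dir -> final target"
    // into a single move from the conditional input dir to the final target.
    static MoveWork mergeForBlobStore(Path condInputPath, Path finalTarget, LineageState lineageState) {
        // Original move: land files from an intermediate directory at finalTarget.
        Path intermediateDir = new Path(finalTarget.getParent(), "_tmp.intermediate"); // illustrative name
        LoadFileDesc loadFile = new LoadFileDesc(intermediateDir, finalTarget, true, null, null, false);
        MoveWork linkedMoveWork = new MoveWork(null, null, null, loadFile, false);
        // Merged move: same target, but sourced directly from the ConditionalTask's input path,
        // saving one rename/copy on blob stores.
        return GenMapRedUtils.mergeMovePaths(condInputPath, linkedMoveWork, lineageState);
    }
}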

Example 9 with MoveWork

use of org.apache.hadoop.hive.ql.plan.MoveWork in project hive by apache.

the class GenMapRedUtils method createMoveTask.

/**
 * Create and add any dependent move tasks
 *
 * @param currTask the current task that the move task should depend on.
 * @param chDir whether the FileSinkOperator output should be redirected to a temporary directory first.
 * @param fsOp the FileSinkOperator whose results are moved.
 * @param parseCtx the parse context.
 * @param mvTasks move tasks already generated for the query.
 * @param hconf the Hive configuration.
 * @param dependencyTask optional dependency-collection task to chain the move task behind.
 * @return the merge-input destination path when chDir is set, otherwise null.
 */
public static Path createMoveTask(Task<? extends Serializable> currTask, boolean chDir, FileSinkOperator fsOp, ParseContext parseCtx, List<Task<MoveWork>> mvTasks, HiveConf hconf, DependencyCollectionTask dependencyTask) {
    Path dest = null;
    FileSinkDesc fileSinkDesc = fsOp.getConf();
    boolean isMmTable = fileSinkDesc.isMmTable();
    if (chDir) {
        dest = fileSinkDesc.getMergeInputDirName();
        if (!isMmTable) {
            // generate the temporary file
            // it must be on the same file system as the current destination
            Context baseCtx = parseCtx.getContext();
            // Create the required temporary file in the HDFS location if the destination
            // path of the FileSinkOperator table is a blobstore path.
            Path tmpDir = baseCtx.getTempDirForFinalJobPath(fileSinkDesc.getDestPath());
            // Change all the linked file sink descriptors
            if (fileSinkDesc.isLinkedFileSink()) {
                for (FileSinkDesc fsConf : fileSinkDesc.getLinkedFileSinkDesc()) {
                    fsConf.setDirName(new Path(tmpDir, fsConf.getDirName().getName()));
                    if (Utilities.FILE_OP_LOGGER.isTraceEnabled()) {
                        Utilities.FILE_OP_LOGGER.trace("createMoveTask setting tmpDir for LinkedFileSink chDir " + fsConf.getDirName() + "; dest was " + fileSinkDesc.getDestPath());
                    }
                }
            } else {
                fileSinkDesc.setDirName(tmpDir);
                if (Utilities.FILE_OP_LOGGER.isTraceEnabled()) {
                    Utilities.FILE_OP_LOGGER.trace("createMoveTask setting tmpDir chDir " + tmpDir + "; dest was " + fileSinkDesc.getDestPath());
                }
            }
        }
    }
    Task<MoveWork> mvTask = null;
    if (!chDir) {
        mvTask = GenMapRedUtils.findMoveTaskForFsopOutput(mvTasks, fsOp.getConf().getFinalDirName(), fsOp.getConf().isMmTable());
    }
    // Set the move task to be dependent on the current task
    if (mvTask != null) {
        GenMapRedUtils.addDependentMoveTasks(mvTask, hconf, currTask, dependencyTask);
    }
    return dest;
}
Also used : Path(org.apache.hadoop.fs.Path) ParseContext(org.apache.hadoop.hive.ql.parse.ParseContext) Context(org.apache.hadoop.hive.ql.Context) CompilationOpContext(org.apache.hadoop.hive.ql.CompilationOpContext) MoveWork(org.apache.hadoop.hive.ql.plan.MoveWork) FileSinkDesc(org.apache.hadoop.hive.ql.plan.FileSinkDesc)
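
For the non-chDir case, the interesting part is the last two calls: looking up the MoveTask that consumes the FileSinkOperator's output and making it run after the current task. Below is a minimal sketch of that wiring, assuming the compile-time state (mvTasks, currTask, dependencyTask, hconf) is already available; the helper name is illustrative, and it sits in the GenMapRedUtils package so the same calls resolve as in the original.

package org.apache.hadoop.hive.ql.optimizer;

import java.io.Serializable;
import java.util.List;

import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.exec.DependencyCollectionTask;
import org.apache.hadoop.hive.ql.exec.FileSinkOperator;
import org.apache.hadoop.hive.ql.exec.Task;
import org.apache.hadoop.hive.ql.plan.MoveWork;

public class WireMoveTaskSketch {

    // Illustrative: attach the pre-existing MoveTask for this FileSinkOperator's output
    // behind the task that produces that output.
    static void wireExistingMoveTask(FileSinkOperator fsOp, List<Task<MoveWork>> mvTasks,
            Task<? extends Serializable> currTask, DependencyCollectionTask dependencyTask, HiveConf hconf) {
        // Find the MoveTask whose source is this FileSinkOperator's final output directory.
        Task<MoveWork> mvTask = GenMapRedUtils.findMoveTaskForFsopOutput(
            mvTasks, fsOp.getConf().getFinalDirName(), fsOp.getConf().isMmTable());
        if (mvTask != null) {
            // Run the move (and the optional dependency-collection barrier) after currTask.
            GenMapRedUtils.addDependentMoveTasks(mvTask, hconf, currTask, dependencyTask);
        }
    }
}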

Example 10 with MoveWork

use of org.apache.hadoop.hive.ql.plan.MoveWork in project hive by apache.

the class GenMapRedUtils method addStatsTask.

/**
 * Add the StatsTask as a dependent task of the MoveTask,
 * because the StatsTask will change the Table/Partition metadata. For atomicity, that metadata
 * should not be changed before the data is actually in place, which the MoveTask guarantees.
 *
 * @param nd
 *          the FileSinkOperator whose results are taken care of by the MoveTask.
 * @param mvTask
 *          The MoveTask that moves the FileSinkOperator's results.
 * @param currTask
 *          The MapRedTask that the FileSinkOperator belongs to.
 * @param hconf
 *          HiveConf
 */
public static void addStatsTask(FileSinkOperator nd, MoveTask mvTask, Task<? extends Serializable> currTask, HiveConf hconf) {
    MoveWork mvWork = mvTask.getWork();
    BasicStatsWork statsWork = null;
    Table table = null;
    boolean truncate = false;
    if (mvWork.getLoadTableWork() != null) {
        statsWork = new BasicStatsWork(mvWork.getLoadTableWork());
        String tableName = mvWork.getLoadTableWork().getTable().getTableName();
        truncate = mvWork.getLoadTableWork().getReplace();
        try {
            table = Hive.get().getTable(SessionState.get().getCurrentDatabase(), tableName);
        } catch (HiveException e) {
            throw new RuntimeException("unexpected; table should be present already..: " + tableName, e);
        }
    } else if (mvWork.getLoadFileWork() != null) {
        statsWork = new BasicStatsWork(mvWork.getLoadFileWork());
        truncate = true;
        if (mvWork.getLoadFileWork().getCtasCreateTableDesc() != null) {
            try {
                table = mvWork.getLoadFileWork().getCtasCreateTableDesc().toTable(hconf);
            } catch (HiveException e) {
                LOG.debug("can't pre-create table for CTAS", e);
                table = null;
            }
        } else if (mvWork.getLoadFileWork().getCreateViewDesc() != null) {
            // CREATE MATERIALIZED VIEW ...
            try {
                table = mvWork.getLoadFileWork().getCreateViewDesc().toTable(hconf);
            } catch (HiveException e) {
                LOG.debug("can't pre-create table for MV", e);
                table = null;
            }
        } else {
            throw new RuntimeException("unexpected; this should be a CTAS or a CREATE/REBUILD MV - however no desc present");
        }
    }
    assert statsWork != null : "Error when generating StatsTask";
    if (currTask.getWork() instanceof MapredWork) {
        MapredWork mrWork = (MapredWork) currTask.getWork();
        mrWork.getMapWork().setGatheringStats(true);
        if (mrWork.getReduceWork() != null) {
            mrWork.getReduceWork().setGatheringStats(true);
        }
    } else if (currTask.getWork() instanceof SparkWork) {
        SparkWork work = (SparkWork) currTask.getWork();
        for (BaseWork w : work.getAllWork()) {
            w.setGatheringStats(true);
        }
    } else {
        // must be TezWork
        TezWork work = (TezWork) currTask.getWork();
        for (BaseWork w : work.getAllWork()) {
            w.setGatheringStats(true);
        }
    }
    StatsWork columnStatsWork = new StatsWork(table, statsWork, hconf);
    columnStatsWork.collectStatsFromAggregator(nd.getConf());
    columnStatsWork.truncateExisting(truncate);
    columnStatsWork.setSourceTask(currTask);
    Task<? extends Serializable> statsTask = TaskFactory.get(columnStatsWork);
    // subscribe feeds from the MoveTask so that MoveTask can forward the list
    // of dynamic partition list to the StatsTask
    mvTask.addDependentTask(statsTask);
    statsTask.subscribeFeed(mvTask);
}
Also used : MoveWork(org.apache.hadoop.hive.ql.plan.MoveWork) Table(org.apache.hadoop.hive.ql.metadata.Table) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) SparkWork(org.apache.hadoop.hive.ql.plan.SparkWork) MapredWork(org.apache.hadoop.hive.ql.plan.MapredWork) StatsWork(org.apache.hadoop.hive.ql.plan.StatsWork) BasicStatsWork(org.apache.hadoop.hive.ql.plan.BasicStatsWork) BasicStatsWork(org.apache.hadoop.hive.ql.plan.BasicStatsWork) BaseWork(org.apache.hadoop.hive.ql.plan.BaseWork) TezWork(org.apache.hadoop.hive.ql.plan.TezWork)
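
The middle of addStatsTask is an engine dispatch: whichever execution engine produced the data, every work unit feeding the FileSink must be flagged to gather basic statistics so the StatsTask has something to publish after the MoveTask lands the files. The sketch below pulls that dispatch into a standalone helper; only the helper name is ours, the branches mirror the example.

import java.io.Serializable;

import org.apache.hadoop.hive.ql.exec.Task;
import org.apache.hadoop.hive.ql.plan.BaseWork;
import org.apache.hadoop.hive.ql.plan.MapredWork;
import org.apache.hadoop.hive.ql.plan.SparkWork;
import org.apache.hadoop.hive.ql.plan.TezWork;

public class GatherStatsFlagSketch {

    // Illustrative helper mirroring the per-engine stats-gathering dispatch above.
    static void enableStatsGathering(Task<? extends Serializable> currTask) {
        if (currTask.getWork() instanceof MapredWork) {
            MapredWork mrWork = (MapredWork) currTask.getWork();
            mrWork.getMapWork().setGatheringStats(true);
            if (mrWork.getReduceWork() != null) {
                mrWork.getReduceWork().setGatheringStats(true);
            }
        } else if (currTask.getWork() instanceof SparkWork) {
            for (BaseWork w : ((SparkWork) currTask.getWork()).getAllWork()) {
                w.setGatheringStats(true);
            }
        } else {
            // As in the original, anything else is assumed to be TezWork.
            for (BaseWork w : ((TezWork) currTask.getWork()).getAllWork()) {
                w.setGatheringStats(true);
            }
        }
    }
}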

Aggregations

MoveWork (org.apache.hadoop.hive.ql.plan.MoveWork) 29
Path (org.apache.hadoop.fs.Path) 21
LoadTableDesc (org.apache.hadoop.hive.ql.plan.LoadTableDesc) 16
LoadFileDesc (org.apache.hadoop.hive.ql.plan.LoadFileDesc) 9
Test (org.junit.Test) 7
ConditionalTask (org.apache.hadoop.hive.ql.exec.ConditionalTask) 6
HiveException (org.apache.hadoop.hive.ql.metadata.HiveException) 6
TableDesc (org.apache.hadoop.hive.ql.plan.TableDesc) 6
Task (org.apache.hadoop.hive.ql.exec.Task) 5
DDLWork (org.apache.hadoop.hive.ql.plan.DDLWork) 5
Serializable (java.io.Serializable) 4
ArrayList (java.util.ArrayList) 4
Context (org.apache.hadoop.hive.ql.Context) 4
FileSinkOperator (org.apache.hadoop.hive.ql.exec.FileSinkOperator) 4
MoveTask (org.apache.hadoop.hive.ql.exec.MoveTask) 4
MapRedTask (org.apache.hadoop.hive.ql.exec.mr.MapRedTask) 4
Partition (org.apache.hadoop.hive.ql.metadata.Partition) 4
BasicStatsWork (org.apache.hadoop.hive.ql.plan.BasicStatsWork) 4
StatsWork (org.apache.hadoop.hive.ql.plan.StatsWork) 4
URISyntaxException (java.net.URISyntaxException) 3