Example 31 with MoveWork

Use of org.apache.hadoop.hive.ql.plan.MoveWork in project hive by apache.

The class GenSparkUtils, method createMoveTask.

/**
 * Create and add any dependent move tasks.
 *
 * This is forked from {@link GenMapRedUtils}. The difference is that it doesn't check
 * 'isLinkedFileSink' and does not set parent dir for the linked file sinks.
 */
public static Path createMoveTask(Task<? extends Serializable> currTask, boolean chDir, FileSinkOperator fsOp, ParseContext parseCtx, List<Task<MoveWork>> mvTasks, HiveConf hconf, DependencyCollectionTask dependencyTask) {
    Path dest = null;
    FileSinkDesc fileSinkDesc = fsOp.getConf();
    if (chDir) {
        dest = fsOp.getConf().getFinalDirName();
        // generate the temporary file
        // it must be on the same file system as the current destination
        Context baseCtx = parseCtx.getContext();
        Path tmpDir = baseCtx.getExternalTmpPath(dest);
        // Change all the linked file sink descriptors
        if (fileSinkDesc.getLinkedFileSinkDesc() != null) {
            for (FileSinkDesc fsConf : fileSinkDesc.getLinkedFileSinkDesc()) {
                fsConf.setDirName(tmpDir);
            }
        } else {
            fileSinkDesc.setDirName(tmpDir);
        }
    }
    Task<MoveWork> mvTask = null;
    if (!chDir) {
        mvTask = GenMapRedUtils.findMoveTaskForFsopOutput(mvTasks, fileSinkDesc.getFinalDirName(), false);
    }
    // Set the move task to be dependent on the current task
    if (mvTask != null) {
        GenMapRedUtils.addDependentMoveTasks(mvTask, hconf, currTask, dependencyTask);
    }
    return dest;
}
Also used : Path(org.apache.hadoop.fs.Path) Context(org.apache.hadoop.hive.ql.Context) ParseContext(org.apache.hadoop.hive.ql.parse.ParseContext) MoveWork(org.apache.hadoop.hive.ql.plan.MoveWork) FileSinkDesc(org.apache.hadoop.hive.ql.plan.FileSinkDesc)
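
To show where this helper fits, here is a minimal, hypothetical call-site sketch; currTask, fsOp, parseCtx, mvTasks, hconf and dependencyTask are assumed to already exist in the Spark task-compiler context and are placeholders, not code from the Hive sources.

// Sketch only: redirect the FileSink output to a temporary directory (chDir = true)
// and wire any matching MoveTask as a dependent of the task that produces the data.
Path finalDest = GenSparkUtils.createMoveTask(
    currTask,        // task producing the FileSink output
    true,            // chDir: write into a tmp dir, move to the final dir afterwards
    fsOp,            // the FileSinkOperator being finalized
    parseCtx,        // parse context supplying the query Context
    mvTasks,         // MoveWork tasks collected so far for this query
    hconf,           // HiveConf
    dependencyTask); // optional dependency collection task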

Example 32 with MoveWork

Use of org.apache.hadoop.hive.ql.plan.MoveWork in project hive by apache.

The class GenSparkUtils, method createMoveTask.

/**
 * Create and add any dependent move tasks.
 *
 * This is forked from {@link GenMapRedUtils}. The difference is that it doesn't check
 * 'isLinkedFileSink' and does not set parent dir for the linked file sinks.
 */
public static Path createMoveTask(Task<?> currTask, boolean chDir, FileSinkOperator fsOp, ParseContext parseCtx, List<Task<MoveWork>> mvTasks, HiveConf hconf, DependencyCollectionTask dependencyTask) {
    Path dest = null;
    FileSinkDesc fileSinkDesc = fsOp.getConf();
    if (chDir) {
        dest = fsOp.getConf().getFinalDirName();
        // generate the temporary file
        // it must be on the same file system as the current destination
        Context baseCtx = parseCtx.getContext();
        Path tmpDir = null;
        // In such case we shouldn't request an external tmp dir as it will end up inside the mr temp dir
        if (baseCtx.isMRTmpFileURI(dest.toUri().getPath())) {
            tmpDir = baseCtx.getMRTmpPath();
        } else {
            tmpDir = baseCtx.getExternalTmpPath(dest);
        }
        // Change all the linked file sink descriptors
        if (fileSinkDesc.getLinkedFileSinkDesc() != null) {
            for (FileSinkDesc fsConf : fileSinkDesc.getLinkedFileSinkDesc()) {
                fsConf.setDirName(tmpDir);
            }
        } else {
            fileSinkDesc.setDirName(tmpDir);
        }
    }
    Task<MoveWork> mvTask = null;
    if (!chDir) {
        mvTask = GenMapRedUtils.findMoveTaskForFsopOutput(mvTasks, fileSinkDesc.getFinalDirName(), false, fileSinkDesc.isDirectInsert(), fileSinkDesc.getMoveTaskId(), fileSinkDesc.getAcidOperation());
    }
    // Set the move task to be dependent on the current task
    if (mvTask != null) {
        GenMapRedUtils.addDependentMoveTasks(mvTask, hconf, currTask, dependencyTask);
    }
    return dest;
}
Also used : Path(org.apache.hadoop.fs.Path) Context(org.apache.hadoop.hive.ql.Context) ParseContext(org.apache.hadoop.hive.ql.parse.ParseContext) MoveWork(org.apache.hadoop.hive.ql.plan.MoveWork) FileSinkDesc(org.apache.hadoop.hive.ql.plan.FileSinkDesc)

Example 33 with MoveWork

Use of org.apache.hadoop.hive.ql.plan.MoveWork in project hive by apache.

The class GenMapRedUtils, method findMoveTaskForFsopOutput.

public static Task<MoveWork> findMoveTaskForFsopOutput(List<Task<MoveWork>> mvTasks, Path fsopFinalDir, boolean isMmFsop, boolean isDirectInsert, String fsoMoveTaskId, AcidUtils.Operation acidOperation) {
    // find the move task
    for (Task<MoveWork> mvTsk : mvTasks) {
        MoveWork mvWork = mvTsk.getWork();
        Path srcDir = null;
        boolean isLfd = false;
        String moveTaskId = null;
        AcidUtils.Operation moveTaskWriteType = null;
        if (mvWork.getLoadFileWork() != null) {
            srcDir = mvWork.getLoadFileWork().getSourcePath();
            isLfd = true;
            if (isMmFsop || isDirectInsert) {
                srcDir = srcDir.getParent();
            }
            moveTaskId = mvWork.getLoadFileWork().getMoveTaskId();
            moveTaskWriteType = mvWork.getLoadFileWork().getWriteType();
        } else if (mvWork.getLoadTableWork() != null) {
            srcDir = mvWork.getLoadTableWork().getSourcePath();
            moveTaskId = mvWork.getLoadTableWork().getMoveTaskId();
            moveTaskWriteType = mvWork.getLoadTableWork().getWriteType();
        }
        if (Utilities.FILE_OP_LOGGER.isTraceEnabled()) {
            Utilities.FILE_OP_LOGGER.trace("Observing MoveWork " + System.identityHashCode(mvWork) + " with " + srcDir + "(from " + (isLfd ? "LFD" : "LTD") + ") while looking for " + fsopFinalDir + "(mm = " + isMmFsop + ")");
        }
        if ((srcDir != null) && srcDir.equals(fsopFinalDir)) {
            if (isDirectInsert || isMmFsop) {
                // For MM and direct-insert writes the source directory alone is not enough to
                // identify the MoveTask, so the move task id and the ACID write type are compared.
                if (moveTaskId.equals(fsoMoveTaskId) && moveTaskWriteType == acidOperation) {
                    return mvTsk;
                }
            } else {
                return mvTsk;
            }
        }
    }
    return null;
}
Also used : MoveWork(org.apache.hadoop.hive.ql.plan.MoveWork) Path(org.apache.hadoop.fs.Path) AcidUtils(org.apache.hadoop.hive.ql.io.AcidUtils)
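
For orientation, a hedged sketch of how this lookup is consumed, mirroring the call in the createMoveTask examples; the local variables are assumed to be in scope, and NOT_ACID is used only as an illustrative write type.

// Sketch only: find the MoveTask whose source path matches the FileSink's final directory,
// then make it depend on the task that produces that output.
Task<MoveWork> moveTask = GenMapRedUtils.findMoveTaskForFsopOutput(
    mvTasks,                         // candidate MoveWork tasks
    fileSinkDesc.getFinalDirName(),  // directory the FileSinkOperator writes to
    false,                           // isMmFsop: not an MM (insert-only transactional) sink
    false,                           // isDirectInsert: not a direct-insert ACID write
    fileSinkDesc.getMoveTaskId(),    // only consulted for MM/direct-insert writes
    AcidUtils.Operation.NOT_ACID);   // only consulted for MM/direct-insert writes
if (moveTask != null) {
    GenMapRedUtils.addDependentMoveTasks(moveTask, hconf, currTask, dependencyTask);
}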

Example 34 with MoveWork

Use of org.apache.hadoop.hive.ql.plan.MoveWork in project hive by apache.

The class GenMapRedUtils, method createMoveTask.

/**
 * Create and add any dependent move tasks.
 *
 * @param currTask the task that produces the FileSink output
 * @param chDir whether the FileSink output should be redirected to a temporary directory
 * @param fsOp the FileSinkOperator being finalized
 * @param parseCtx parse context of the current query
 * @param mvTasks move tasks already created for this query
 * @param hconf the Hive configuration
 * @param dependencyTask optional task the generated move task should depend on
 * @return the FileSink's merge input (final destination) directory when chDir is true, otherwise null
 */
public static Path createMoveTask(Task<?> currTask, boolean chDir, FileSinkOperator fsOp, ParseContext parseCtx, List<Task<MoveWork>> mvTasks, HiveConf hconf, DependencyCollectionTask dependencyTask) {
    Path dest = null;
    FileSinkDesc fileSinkDesc = fsOp.getConf();
    boolean isMmTable = fileSinkDesc.isMmTable();
    boolean isDirectInsert = fileSinkDesc.isDirectInsert();
    if (chDir) {
        dest = fileSinkDesc.getMergeInputDirName();
        /**
         * Skip temporary file generation for:
         * 1. MM Tables
         * 2. INSERT operation on full ACID table
         */
        if (!isMmTable && !isDirectInsert) {
            // generate the temporary file
            // it must be on the same file system as the current destination
            Context baseCtx = parseCtx.getContext();
            // Create the required temporary file in the HDFS location if the destination
            // path of the FileSinkOperator table is a blobstore path.
            Path tmpDir = baseCtx.getTempDirForFinalJobPath(fileSinkDesc.getDestPath());
            // Change all the linked file sink descriptors
            if (fileSinkDesc.isLinkedFileSink()) {
                for (FileSinkDesc fsConf : fileSinkDesc.getLinkedFileSinkDesc()) {
                    fsConf.setDirName(new Path(tmpDir, fsConf.getDirName().getName()));
                    if (Utilities.FILE_OP_LOGGER.isTraceEnabled()) {
                        Utilities.FILE_OP_LOGGER.trace("createMoveTask setting tmpDir for LinkedFileSink chDir " + fsConf.getDirName() + "; dest was " + fileSinkDesc.getDestPath());
                    }
                }
            } else {
                fileSinkDesc.setDirName(tmpDir);
                if (Utilities.FILE_OP_LOGGER.isTraceEnabled()) {
                    Utilities.FILE_OP_LOGGER.trace("createMoveTask setting tmpDir chDir " + tmpDir + "; dest was " + fileSinkDesc.getDestPath());
                }
            }
        }
    }
    Task<MoveWork> mvTask = null;
    if (!chDir) {
        mvTask = GenMapRedUtils.findMoveTaskForFsopOutput(mvTasks, fsOp.getConf().getFinalDirName(), isMmTable, isDirectInsert, fsOp.getConf().getMoveTaskId(), fsOp.getConf().getAcidOperation());
    }
    // Set the move task to be dependent on the current task
    if (mvTask != null) {
        GenMapRedUtils.addDependentMoveTasks(mvTask, hconf, currTask, dependencyTask);
    }
    return dest;
}
Also used : Path(org.apache.hadoop.fs.Path) ParseContext(org.apache.hadoop.hive.ql.parse.ParseContext) Context(org.apache.hadoop.hive.ql.Context) CompilationOpContext(org.apache.hadoop.hive.ql.CompilationOpContext) MoveWork(org.apache.hadoop.hive.ql.plan.MoveWork) FileSinkDesc(org.apache.hadoop.hive.ql.plan.FileSinkDesc)
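
One point worth calling out in this variant: for MM (insert-only transactional) tables and direct-insert ACID writes the sink keeps writing to its final location, so no temporary directory is created and commit is handled through write ids instead of a rename. A minimal illustrative sketch, assuming a FileSinkDesc and ParseContext in scope as above:

// Sketch only: when either flag is set, createMoveTask leaves the sink's directory untouched;
// otherwise the output is redirected to a temporary directory for the final path.
boolean writesDirectlyToFinalLocation = fileSinkDesc.isMmTable() || fileSinkDesc.isDirectInsert();
if (!writesDirectlyToFinalLocation) {
    Path tmpDir = parseCtx.getContext().getTempDirForFinalJobPath(fileSinkDesc.getDestPath());
    fileSinkDesc.setDirName(tmpDir);
}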

Example 35 with MoveWork

Use of org.apache.hadoop.hive.ql.plan.MoveWork in project hive by apache.

The class LoadSemanticAnalyzer, method analyzeLoad.

private void analyzeLoad(ASTNode ast) throws SemanticException {
    fromTree = ast.getChild(0);
    tableTree = ast.getChild(1);
    boolean inputInfo = false;
    // Check the last node
    ASTNode child = (ASTNode) ast.getChild(ast.getChildCount() - 1);
    if (child.getToken().getType() == HiveParser.TOK_INPUTFORMAT) {
        if (child.getChildCount() != 2) {
            throw new SemanticException("FileFormat should contain both input format and Serde");
        }
        try {
            inputFormatClassName = stripQuotes(child.getChild(0).getText());
            serDeClassName = stripQuotes(child.getChild(1).getText());
            inputInfo = true;
        } catch (Exception e) {
            throw new SemanticException("FileFormat inputFormatClassName or serDeClassName is incorrect");
        }
    }
    if ((!inputInfo && ast.getChildCount() == 4) || (inputInfo && ast.getChildCount() == 5)) {
        isLocal = true;
        isOverWrite = true;
    }
    if ((!inputInfo && ast.getChildCount() == 3) || (inputInfo && ast.getChildCount() == 4)) {
        if (ast.getChild(2).getText().toLowerCase().equals("local")) {
            isLocal = true;
        } else {
            isOverWrite = true;
        }
    }
    // initialize load path
    URI fromURI;
    try {
        String fromPath = stripQuotes(fromTree.getText());
        fromURI = initializeFromURI(fromPath, isLocal);
    } catch (IOException | URISyntaxException e) {
        throw new SemanticException(ASTErrorUtils.getMsg(ErrorMsg.INVALID_PATH.getMsg(), fromTree, e.getMessage()), e);
    }
    // initialize destination table/partition
    TableSpec ts = new TableSpec(db, conf, (ASTNode) tableTree);
    if (ts.tableHandle.isView() || ts.tableHandle.isMaterializedView()) {
        throw new SemanticException(ErrorMsg.DML_AGAINST_VIEW.getMsg());
    }
    if (ts.tableHandle.isNonNative()) {
        throw new SemanticException(ErrorMsg.LOAD_INTO_NON_NATIVE.getMsg());
    }
    if (ts.tableHandle.isStoredAsSubDirectories()) {
        throw new SemanticException(ErrorMsg.LOAD_INTO_STORED_AS_DIR.getMsg());
    }
    List<FieldSchema> parts = ts.tableHandle.getPartitionKeys();
    if ((parts != null && parts.size() > 0) && (ts.partSpec == null || ts.partSpec.size() == 0)) {
        // launch a tez job
        reparseAndSuperAnalyze(ts.tableHandle, fromURI);
        return;
    }
    List<String> bucketCols = ts.tableHandle.getBucketCols();
    if (bucketCols != null && !bucketCols.isEmpty()) {
        String error = StrictChecks.checkBucketing(conf);
        if (error != null) {
            // launch a tez job
            reparseAndSuperAnalyze(ts.tableHandle, fromURI);
            return;
        }
    }
    // make sure the arguments make sense
    List<FileStatus> files = applyConstraintsAndGetFiles(fromURI, ts.tableHandle);
    if (queryReWritten) {
        return;
    }
    // for managed tables, make sure the file formats match
    if (TableType.MANAGED_TABLE.equals(ts.tableHandle.getTableType()) && conf.getBoolVar(HiveConf.ConfVars.HIVECHECKFILEFORMAT)) {
        ensureFileFormatsMatch(ts, files, fromURI);
    }
    inputs.add(toReadEntity(new Path(fromURI)));
    // create final load/move work
    boolean preservePartitionSpecs = false;
    Map<String, String> partSpec = ts.getPartSpec();
    if (partSpec == null) {
        partSpec = new LinkedHashMap<String, String>();
        outputs.add(new WriteEntity(ts.tableHandle, (isOverWrite ? WriteEntity.WriteType.INSERT_OVERWRITE : WriteEntity.WriteType.INSERT)));
    } else {
        try {
            Partition part = Hive.get().getPartition(ts.tableHandle, partSpec, false);
            if (part != null) {
                if (isOverWrite) {
                    outputs.add(new WriteEntity(part, WriteEntity.WriteType.INSERT_OVERWRITE));
                } else {
                    outputs.add(new WriteEntity(part, WriteEntity.WriteType.INSERT));
                    // If partition already exists and we aren't overwriting it, then respect
                    // its current location info rather than picking it from the parent TableDesc
                    preservePartitionSpecs = true;
                }
            } else {
                outputs.add(new WriteEntity(ts.tableHandle, (isOverWrite ? WriteEntity.WriteType.INSERT_OVERWRITE : WriteEntity.WriteType.INSERT)));
            }
        } catch (HiveException e) {
            throw new SemanticException(e);
        }
    }
    Long writeId = null;
    int stmtId = -1;
    boolean isTxnTable = AcidUtils.isTransactionalTable(ts.tableHandle);
    if (isTxnTable) {
        try {
            writeId = getTxnMgr().getTableWriteId(ts.tableHandle.getDbName(), ts.tableHandle.getTableName());
        } catch (LockException ex) {
            throw new SemanticException("Failed to allocate the write id", ex);
        }
        stmtId = getTxnMgr().getStmtIdAndIncrement();
    }
    // Note: this sets LoadFileType incorrectly for ACID; is that relevant for load?
    // See setLoadFileType and setIsAcidIow calls elsewhere for an example.
    LoadTableDesc loadTableWork = new LoadTableDesc(new Path(fromURI), Utilities.getTableDesc(ts.tableHandle), partSpec, isOverWrite ? LoadFileType.REPLACE_ALL : LoadFileType.KEEP_EXISTING, writeId);
    loadTableWork.setStmtId(stmtId);
    loadTableWork.setInsertOverwrite(isOverWrite);
    if (preservePartitionSpecs) {
        // Note: preservePartitionSpecs=true implies inheritTableSpecs=false, but
        // preservePartitionSpecs=false (the default) is not sufficient information
        // to set inheritTableSpecs=true.
        loadTableWork.setInheritTableSpecs(false);
    }
    Task<?> childTask = TaskFactory.get(new MoveWork(getInputs(), getOutputs(), loadTableWork, null, true, isLocal));
    rootTasks.add(childTask);
    // If the user asked for stats to be collected: some stats, such as the number of rows,
    // require a scan of the data, while others, such as the number of files, do not.
    // Update only the stats that do not require a complete scan.
    Task<?> statTask = null;
    if (conf.getBoolVar(HiveConf.ConfVars.HIVESTATSAUTOGATHER)) {
        BasicStatsWork basicStatsWork = new BasicStatsWork(loadTableWork);
        basicStatsWork.setNoStatsAggregator(true);
        basicStatsWork.setClearAggregatorStats(true);
        StatsWork columnStatsWork = new StatsWork(ts.tableHandle, basicStatsWork, conf);
        statTask = TaskFactory.get(columnStatsWork);
    }
    if (statTask != null) {
        childTask.addDependentTask(statTask);
    }
}
Also used : MoveWork(org.apache.hadoop.hive.ql.plan.MoveWork) FileStatus(org.apache.hadoop.fs.FileStatus) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) FieldSchema(org.apache.hadoop.hive.metastore.api.FieldSchema) URISyntaxException(java.net.URISyntaxException) URI(java.net.URI) StatsWork(org.apache.hadoop.hive.ql.plan.StatsWork) BasicStatsWork(org.apache.hadoop.hive.ql.plan.BasicStatsWork) LockException(org.apache.hadoop.hive.ql.lockmgr.LockException) WriteEntity(org.apache.hadoop.hive.ql.hooks.WriteEntity) Path(org.apache.hadoop.fs.Path) Partition(org.apache.hadoop.hive.ql.metadata.Partition) IOException(java.io.IOException) DecoderException(org.apache.commons.codec.DecoderException) LoadTableDesc(org.apache.hadoop.hive.ql.plan.LoadTableDesc)
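
A condensed, hedged restatement of the load path above, showing just how the MoveWork is assembled from the LoadTableDesc and registered as a root task; write-id/statement-id handling and the stats task are omitted, and inputs/outputs are assumed to come from the analyzer as in the full example.

// Sketch only: wrap the load descriptor in a MoveWork and hand it to the task factory.
LoadTableDesc loadTableWork = new LoadTableDesc(
    new Path(fromURI),                       // source of the files being loaded
    Utilities.getTableDesc(ts.tableHandle),  // destination table descriptor
    partSpec,                                // target partition spec (may be empty)
    isOverWrite ? LoadFileType.REPLACE_ALL : LoadFileType.KEEP_EXISTING,
    writeId);                                // null for non-transactional tables
Task<?> moveTask = TaskFactory.get(
    new MoveWork(getInputs(), getOutputs(), loadTableWork, null, true, isLocal));
rootTasks.add(moveTask);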

Aggregations

MoveWork (org.apache.hadoop.hive.ql.plan.MoveWork) 42
Path (org.apache.hadoop.fs.Path) 30
LoadTableDesc (org.apache.hadoop.hive.ql.plan.LoadTableDesc) 24
LoadFileDesc (org.apache.hadoop.hive.ql.plan.LoadFileDesc) 11
HiveException (org.apache.hadoop.hive.ql.metadata.HiveException) 10
TableDesc (org.apache.hadoop.hive.ql.plan.TableDesc) 8
Context (org.apache.hadoop.hive.ql.Context) 7
ConditionalTask (org.apache.hadoop.hive.ql.exec.ConditionalTask) 7
Task (org.apache.hadoop.hive.ql.exec.Task) 7
Partition (org.apache.hadoop.hive.ql.metadata.Partition) 7
FileSinkDesc (org.apache.hadoop.hive.ql.plan.FileSinkDesc) 7
Test (org.junit.Test) 7
ArrayList (java.util.ArrayList) 6
BasicStatsWork (org.apache.hadoop.hive.ql.plan.BasicStatsWork) 6
LoadMultiFilesDesc (org.apache.hadoop.hive.ql.plan.LoadMultiFilesDesc) 6
StatsWork (org.apache.hadoop.hive.ql.plan.StatsWork) 6
Serializable (java.io.Serializable) 5
MetaException (org.apache.hadoop.hive.metastore.api.MetaException) 5
MoveTask (org.apache.hadoop.hive.ql.exec.MoveTask) 5
DDLWork (org.apache.hadoop.hive.ql.plan.DDLWork) 5