Search in sources:

Example 1 with CopyWork

Use of org.apache.hadoop.hive.ql.plan.CopyWork in project hive by apache.

From the class ImportSemanticAnalyzer, the method loadTable:

private static Task<?> loadTable(URI fromURI, Table table, boolean replace, Path tgtPath, ReplicationSpec replicationSpec, EximUtil.SemanticAnalyzerWrapperContext x, Long writeId, int stmtId, boolean isSourceMm) {
    Path dataPath = new Path(fromURI.toString(), EximUtil.DATA_PATH_NAME);
    Path destPath = null, loadPath = null;
    LoadFileType lft;
    if (AcidUtils.isInsertOnlyTable(table)) {
        String mmSubdir = replace ? AcidUtils.baseDir(writeId) : AcidUtils.deltaSubdir(writeId, writeId, stmtId);
        destPath = new Path(tgtPath, mmSubdir);
        loadPath = tgtPath;
        lft = LoadFileType.KEEP_EXISTING;
    } else {
        destPath = loadPath = x.getCtx().getExternalTmpPath(tgtPath);
        lft = replace ? LoadFileType.REPLACE_ALL : LoadFileType.OVERWRITE_EXISTING;
    }
    if (Utilities.FILE_OP_LOGGER.isTraceEnabled()) {
        Utilities.FILE_OP_LOGGER.trace("adding import work for table with source location: " + dataPath + "; table: " + tgtPath + "; copy destination " + destPath + "; mm " + writeId + " (src " + isSourceMm + ") for " + (table == null ? "a new table" : table.getTableName()));
    }
    Task<?> copyTask = null;
    if (replicationSpec.isInReplicationScope()) {
        if (isSourceMm || isAcid(writeId)) {
            // Note: this is replication gap, not MM gap... Repl V2 is not ready yet.
            throw new RuntimeException("Replicating MM and ACID tables is not supported");
        }
        copyTask = ReplCopyTask.getLoadCopyTask(replicationSpec, dataPath, destPath, x.getConf());
    } else {
        CopyWork cw = new CopyWork(dataPath, destPath, false);
        cw.setSkipSourceMmDirs(isSourceMm);
        copyTask = TaskFactory.get(cw);
    }
    LoadTableDesc loadTableWork = new LoadTableDesc(loadPath, Utilities.getTableDesc(table), new TreeMap<>(), lft, writeId);
    loadTableWork.setStmtId(stmtId);
    MoveWork mv = new MoveWork(x.getInputs(), x.getOutputs(), loadTableWork, null, false);
    Task<?> loadTableTask = TaskFactory.get(mv);
    copyTask.addDependentTask(loadTableTask);
    x.getTasks().add(copyTask);
    return loadTableTask;
}
Also used: Path(org.apache.hadoop.fs.Path) MoveWork(org.apache.hadoop.hive.ql.plan.MoveWork) LoadTableDesc(org.apache.hadoop.hive.ql.plan.LoadTableDesc) LoadFileType(org.apache.hadoop.hive.ql.plan.LoadTableDesc.LoadFileType) CopyWork(org.apache.hadoop.hive.ql.plan.CopyWork)
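
The essence of loadTable is a two-task chain: a CopyTask stages the exported files, then a MoveTask loads them into the table. A minimal sketch of that wiring, assuming only the constructors and factory calls shown above; the paths and loadTableDesc are illustrative placeholders, and new HashSet<>() stands in for x.getInputs()/x.getOutputs().

// Sketch only: the copy-then-move chain built by loadTable; all inputs are placeholders.
Path src = new Path("/staging/export/data");
Path dst = new Path("/warehouse/db.db/t/.hive-staging");
Task<?> copy = TaskFactory.get(new CopyWork(src, dst, false));
// MoveTask can rename instead of copying when both paths are on the same FileSystem,
// which is why the copy into a staging directory is kept as a separate task.
MoveWork mv = new MoveWork(new HashSet<>(), new HashSet<>(), loadTableDesc, null, false);
Task<?> move = TaskFactory.get(mv);
// The load must only run after the copy succeeds.
copy.addDependentTask(move);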

Example 2 with CopyWork

Use of org.apache.hadoop.hive.ql.plan.CopyWork in project hive by apache.

From the class ReplCopyTask, the method getLoadCopyTask:

private static Task<?> getLoadCopyTask(ReplicationSpec replicationSpec, Path srcPath, Path dstPath, HiveConf conf, boolean isAutoPurge, boolean needRecycle, boolean readSourceAsFileList, boolean overWrite, boolean deleteDestination, String dumpDirectory, ReplicationMetricCollector metricCollector) {
    Task<?> copyTask = null;
    LOG.debug("ReplCopyTask:getLoadCopyTask: {}=>{}", srcPath, dstPath);
    if ((replicationSpec != null) && replicationSpec.isInReplicationScope()) {
        ReplCopyWork rcwork = new ReplCopyWork(srcPath, dstPath, false, overWrite, dumpDirectory, metricCollector);
        rcwork.setReadSrcAsFilesList(readSourceAsFileList);
        if (replicationSpec.isReplace() && deleteDestination) {
            rcwork.setDeleteDestIfExist(true);
            rcwork.setAutoPurge(isAutoPurge);
            rcwork.setNeedRecycle(needRecycle);
        }
        // For replace case, duplicate check should not be done. The new base directory will automatically make the older
        // data invisible. Doing duplicate check and ignoring copy will cause consistency issue if there are multiple
        // replace events getting replayed in the first incremental load.
        rcwork.setCheckDuplicateCopy(replicationSpec.needDupCopyCheck() && !replicationSpec.isReplace());
        LOG.debug("ReplCopyTask:\trcwork");
        String distCpDoAsUser = conf.getVar(HiveConf.ConfVars.HIVE_DISTCP_DOAS_USER);
        rcwork.setDistCpDoAsUser(distCpDoAsUser);
        copyTask = TaskFactory.get(rcwork, conf);
    } else {
        LOG.debug("ReplCopyTask:\tcwork");
        copyTask = TaskFactory.get(new CopyWork(srcPath, dstPath, false, dumpDirectory, metricCollector, true), conf);
    }
    return copyTask;
}
Also used: ReplCopyWork(org.apache.hadoop.hive.ql.plan.ReplCopyWork) CopyWork(org.apache.hadoop.hive.ql.plan.CopyWork)
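
Condensed, the in-replication-scope branch is driven by a handful of flags on ReplCopyWork. A hedged sketch of the replace case with illustrative flag values; srcPath, dstPath, and conf are assumed to come from the caller.

// Sketch only: the replace case from the branch above, with illustrative values.
ReplCopyWork rcwork = new ReplCopyWork(srcPath, dstPath, false);
rcwork.setReadSrcAsFilesList(true);   // read the source as a list of files rather than a directory tree
rcwork.setDeleteDestIfExist(true);    // replace semantics: clear the destination first
rcwork.setAutoPurge(false);           // illustrative; derived from table properties in practice
rcwork.setNeedRecycle(false);         // illustrative; depends on ChangeManager being enabled
rcwork.setCheckDuplicateCopy(false);  // replace events skip the duplicate check (see the comment above)
Task<?> copyTask = TaskFactory.get(rcwork, conf);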

Example 3 with CopyWork

Use of org.apache.hadoop.hive.ql.plan.CopyWork in project hive by apache.

From the class ImportSemanticAnalyzer, the method addSinglePartition:

private static Task<?> addSinglePartition(ImportTableDesc tblDesc, Table table, Warehouse wh, AlterTableAddPartitionDesc addPartitionDesc, ReplicationSpec replicationSpec, EximUtil.SemanticAnalyzerWrapperContext x, Long writeId, int stmtId, boolean isReplication, String dumpRoot, ReplicationMetricCollector metricCollector) throws MetaException, IOException, HiveException {
    AlterTableAddPartitionDesc.PartitionDesc partSpec = addPartitionDesc.getPartitions().get(0);
    boolean isSkipTrash = false;
    boolean needRecycle = false;
    if (shouldSkipDataCopyInReplScope(tblDesc, replicationSpec) || (tblDesc.isExternal() && tblDesc.getLocation() == null)) {
        x.getLOG().debug("Adding AddPart and skipped data copy for partition " + partSpecToString(partSpec.getPartSpec()));
        // addPartitionDesc already has the right partition location
        @SuppressWarnings("unchecked") Task<?> addPartTask = TaskFactory.get(new DDLWork(x.getInputs(), x.getOutputs(), addPartitionDesc, isReplication, dumpRoot, metricCollector), x.getConf());
        return addPartTask;
    } else {
        String srcLocation = partSpec.getLocation();
        if (replicationSpec.isInReplicationScope() && !ReplicationSpec.Type.IMPORT.equals(replicationSpec.getReplSpecType())) {
            Path partLocation = new Path(partSpec.getLocation());
            Path dataDirBase = partLocation.getParent();
            String bucketDir = partLocation.getName();
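            // Climb one directory level per additional partition column, rebuilding the
            // relative partition path (e.g. "p1=a/p2=b") so it can be re-rooted under the
            // dump's data directory below.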
            for (int i = 1; i < partSpec.getPartSpec().size(); i++) {
                bucketDir = dataDirBase.getName() + File.separator + bucketDir;
                dataDirBase = dataDirBase.getParent();
            }
            String relativePartDataPath = EximUtil.DATA_PATH_NAME + File.separator + bucketDir;
            srcLocation = new Path(dataDirBase, relativePartDataPath).toString();
        }
        fixLocationInPartSpec(tblDesc, table, wh, replicationSpec, partSpec, x);
        x.getLOG().debug("adding dependent CopyWork/AddPart/MoveWork for partition " + partSpecToString(partSpec.getPartSpec()) + " with source location: " + srcLocation);
        Path tgtLocation = new Path(partSpec.getLocation());
        LoadFileType loadFileType;
        Path destPath;
        if (replicationSpec.isInReplicationScope()) {
            loadFileType = LoadFileType.IGNORE;
            destPath = tgtLocation;
            isSkipTrash = MetaStoreUtils.isSkipTrash(table.getParameters());
            if (table.isTemporary()) {
                needRecycle = false;
            } else {
                org.apache.hadoop.hive.metastore.api.Database db = x.getHive().getDatabase(table.getDbName());
                needRecycle = db != null && ReplChangeManager.shouldEnableCm(db, table.getTTable());
            }
        } else {
            loadFileType = replicationSpec.isReplace() ? LoadFileType.REPLACE_ALL : LoadFileType.OVERWRITE_EXISTING;
            // In replication scope the write id will be invalid
            boolean useStagingDirectory = !AcidUtils.isTransactionalTable(table.getParameters()) || replicationSpec.isInReplicationScope();
            destPath = useStagingDirectory ? x.getCtx().getExternalTmpPath(tgtLocation) : new Path(tgtLocation, AcidUtils.deltaSubdir(writeId, writeId, stmtId));
        }
        Path moveTaskSrc = !AcidUtils.isTransactionalTable(table.getParameters()) || replicationSpec.isInReplicationScope() ? destPath : tgtLocation;
        if (Utilities.FILE_OP_LOGGER.isTraceEnabled()) {
            Utilities.FILE_OP_LOGGER.trace("adding import work for partition with source location: " + srcLocation + "; target: " + tgtLocation + "; copy dest " + destPath + "; mm " + writeId + " for " + partSpecToString(partSpec.getPartSpec()) + ": " + (AcidUtils.isFullAcidTable(table) ? "acid" : (AcidUtils.isInsertOnlyTable(table) ? "mm" : "flat")));
        }
        Task<?> copyTask = null;
        if (replicationSpec.isInReplicationScope()) {
            boolean copyAtLoad = x.getConf().getBoolVar(HiveConf.ConfVars.REPL_RUN_DATA_COPY_TASKS_ON_TARGET);
            copyTask = ReplCopyTask.getLoadCopyTask(replicationSpec, new Path(srcLocation), destPath, x.getConf(), isSkipTrash, needRecycle, copyAtLoad, dumpRoot, metricCollector);
        } else {
            copyTask = TaskFactory.get(new CopyWork(new Path(srcLocation), destPath, false, dumpRoot, metricCollector, isReplication));
        }
        Task<?> addPartTask = null;
        if (x.getEventType() != DumpType.EVENT_COMMIT_TXN) {
            // During replication, by the time we are applying commit transaction event, we expect
            // the partition/s to be already added or altered by previous events. So no need to
            // create add partition event again.
            addPartTask = TaskFactory.get(new DDLWork(x.getInputs(), x.getOutputs(), addPartitionDesc, isReplication, dumpRoot, metricCollector), x.getConf());
        }
        MoveWork moveWork = new MoveWork(x.getInputs(), x.getOutputs(), null, null, false, dumpRoot, metricCollector, isReplication);
        // See setLoadFileType and setIsAcidIow calls elsewhere for an example.
        if (replicationSpec.isInReplicationScope() && AcidUtils.isTransactionalTable(tblDesc.getTblProps())) {
            LoadMultiFilesDesc loadFilesWork = new LoadMultiFilesDesc(Collections.singletonList(destPath), Collections.singletonList(tgtLocation), true, null, null);
            moveWork.setMultiFilesDesc(loadFilesWork);
            moveWork.setNeedCleanTarget(replicationSpec.isReplace());
        } else {
            LoadTableDesc loadTableWork = new LoadTableDesc(moveTaskSrc, Utilities.getTableDesc(table), partSpec.getPartSpec(), loadFileType, writeId);
            loadTableWork.setStmtId(stmtId);
            loadTableWork.setInheritTableSpecs(false);
            moveWork.setLoadTableWork(loadTableWork);
        }
        if (loadFileType == LoadFileType.IGNORE) {
            // If the data was copied directly to the final partition location (IGNORE), the
            // add-partition event already creates the partition with correct statistics; for
            // insert events the statistics update is again done in load operations as part of
            // the move task.
            if (x.getEventType() == DumpType.EVENT_INSERT) {
                copyTask.addDependentTask(TaskFactory.get(moveWork, x.getConf()));
            } else {
                if (addPartTask != null) {
                    copyTask.addDependentTask(addPartTask);
                }
            }
            return copyTask;
        }
        Task<?> loadPartTask = TaskFactory.get(moveWork, x.getConf());
        copyTask.addDependentTask(loadPartTask);
        if (addPartTask != null) {
            addPartTask.addDependentTask(loadPartTask);
            x.getTasks().add(copyTask);
            return addPartTask;
        }
        return copyTask;
    }
}
Also used: Path(org.apache.hadoop.fs.Path) MoveWork(org.apache.hadoop.hive.ql.plan.MoveWork) AlterTableAddPartitionDesc(org.apache.hadoop.hive.ql.ddl.table.partition.add.AlterTableAddPartitionDesc) LoadFileType(org.apache.hadoop.hive.ql.plan.LoadTableDesc.LoadFileType) LoadMultiFilesDesc(org.apache.hadoop.hive.ql.plan.LoadMultiFilesDesc) LoadTableDesc(org.apache.hadoop.hive.ql.plan.LoadTableDesc) DDLWork(org.apache.hadoop.hive.ql.ddl.DDLWork) CopyWork(org.apache.hadoop.hive.ql.plan.CopyWork) Database(org.apache.hadoop.hive.metastore.api.Database)
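
The tail of addSinglePartition returns a different root task depending on the load type and the replayed event. A condensed, hedged restatement of that wiring; every task and flag below is a placeholder for the values computed above.

// Sketch only: the dependency shapes produced by the code above.
if (loadFileType == LoadFileType.IGNORE) {
    // Data was copied straight to the partition location; only the follow-up differs.
    if (x.getEventType() == DumpType.EVENT_INSERT) {
        copyTask.addDependentTask(loadPartTask);    // statistics are updated by the move
    } else if (addPartTask != null) {
        copyTask.addDependentTask(addPartTask);     // DDL only; no move is needed
    }
    // Root task: copyTask.
} else {
    copyTask.addDependentTask(loadPartTask);        // the move waits on the copy...
    if (addPartTask != null) {
        addPartTask.addDependentTask(loadPartTask); // ...and on the add-partition DDL
    }
    // Root task: addPartTask if present, otherwise copyTask.
}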

Example 4 with CopyWork

Use of org.apache.hadoop.hive.ql.plan.CopyWork in project hive by apache.

From the class ImportSemanticAnalyzer, the method loadTable (replication-aware variant):

private static Task<?> loadTable(URI fromURI, Table table, boolean replace, Path tgtPath, ReplicationSpec replicationSpec, EximUtil.SemanticAnalyzerWrapperContext x, Long writeId, int stmtId, String dumpRoot, ReplicationMetricCollector metricCollector) throws HiveException {
    assert table != null;
    assert table.getParameters() != null;
    Path dataPath = new Path(fromURI.toString(), EximUtil.DATA_PATH_NAME);
    Path destPath = null, loadPath = null;
    LoadFileType lft;
    boolean isSkipTrash = false;
    boolean needRecycle = false;
    if (replicationSpec.isInReplicationScope()) {
        isSkipTrash = MetaStoreUtils.isSkipTrash(table.getParameters());
        if (table.isTemporary()) {
            needRecycle = false;
        } else {
            org.apache.hadoop.hive.metastore.api.Database db = x.getHive().getDatabase(table.getDbName());
            needRecycle = db != null && ReplChangeManager.shouldEnableCm(db, table.getTTable());
        }
    }
    if (AcidUtils.isTransactionalTable(table)) {
        String mmSubdir = replace ? AcidUtils.baseDir(writeId) : AcidUtils.deltaSubdir(writeId, writeId, stmtId);
        destPath = new Path(tgtPath, mmSubdir);
        /**
         * CopyTask below will copy files from the 'archive' to a delta_x_x in the table/partition
         * directory, i.e. the final destination for these files.  This has to be a copy to preserve
         * the archive.  MoveTask is optimized to do a 'rename' if files are on the same FileSystem.
         * So setting 'loadPath' this way will make
         * {@link Hive#loadTable(Path, String, LoadFileType, boolean, boolean, boolean,
         * boolean, Long, int)}
         * skip the unnecessary file (rename) operation but it will perform other things.
         */
        loadPath = tgtPath;
        lft = LoadFileType.KEEP_EXISTING;
    } else {
        destPath = loadPath = x.getCtx().getExternalTmpPath(tgtPath);
        lft = replace ? LoadFileType.REPLACE_ALL : LoadFileType.OVERWRITE_EXISTING;
    }
    if (Utilities.FILE_OP_LOGGER.isTraceEnabled()) {
        Utilities.FILE_OP_LOGGER.trace("adding import work for table with source location: " + dataPath + "; table: " + tgtPath + "; copy destination " + destPath + "; mm " + writeId + " for " + table.getTableName() + ": " + (AcidUtils.isFullAcidTable(table) ? "acid" : (AcidUtils.isInsertOnlyTable(table) ? "mm" : "flat")));
    }
    Task<?> copyTask = null;
    if (replicationSpec.isInReplicationScope()) {
        boolean copyAtLoad = x.getConf().getBoolVar(HiveConf.ConfVars.REPL_RUN_DATA_COPY_TASKS_ON_TARGET);
        copyTask = ReplCopyTask.getLoadCopyTask(replicationSpec, dataPath, destPath, x.getConf(), isSkipTrash, needRecycle, copyAtLoad, dumpRoot, metricCollector);
    } else {
        copyTask = TaskFactory.get(new CopyWork(dataPath, destPath, false, dumpRoot, metricCollector, true));
    }
    MoveWork moveWork = new MoveWork(x.getInputs(), x.getOutputs(), null, null, false, dumpRoot, metricCollector, true);
    if (replicationSpec.isInReplicationScope() && AcidUtils.isTransactionalTable(table)) {
        LoadMultiFilesDesc loadFilesWork = new LoadMultiFilesDesc(Collections.singletonList(destPath), Collections.singletonList(tgtPath), true, null, null);
        moveWork.setMultiFilesDesc(loadFilesWork);
        moveWork.setNeedCleanTarget(replace);
    } else {
        LoadTableDesc loadTableWork = new LoadTableDesc(loadPath, Utilities.getTableDesc(table), new TreeMap<>(), lft, writeId);
        loadTableWork.setStmtId(stmtId);
        moveWork.setLoadTableWork(loadTableWork);
    }
    // If importing into an existing table, the FileFormat is checked by
    // ImportSemanticAnalyzer.checkTable().
    Task<?> loadTableTask = TaskFactory.get(moveWork, x.getConf());
    copyTask.addDependentTask(loadTableTask);
    x.getTasks().add(copyTask);
    return loadTableTask;
}
Also used: Path(org.apache.hadoop.fs.Path) MoveWork(org.apache.hadoop.hive.ql.plan.MoveWork) LoadFileType(org.apache.hadoop.hive.ql.plan.LoadTableDesc.LoadFileType) LoadMultiFilesDesc(org.apache.hadoop.hive.ql.plan.LoadMultiFilesDesc) LoadTableDesc(org.apache.hadoop.hive.ql.plan.LoadTableDesc) CopyWork(org.apache.hadoop.hive.ql.plan.CopyWork) Database(org.apache.hadoop.hive.metastore.api.Database)
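
For transactional tables, destPath above is a base or delta subdirectory whose name is derived from the write id. A hedged worked example of the resulting directory names, assuming the conventional AcidUtils formats:

// Illustrative only: names produced by the AcidUtils helpers used above,
// assuming writeId = 5 and stmtId = 0.
// AcidUtils.baseDir(5L)            -> "base_0000005"                (replace == true)
// AcidUtils.deltaSubdir(5L, 5L, 0) -> "delta_0000005_0000005_0000"  (replace == false)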

Example 5 with CopyWork

Use of org.apache.hadoop.hive.ql.plan.CopyWork in project hive by apache.

From the class ReplCopyTask, the method getDumpCopyTask:

public static Task<?> getDumpCopyTask(ReplicationSpec replicationSpec, Path srcPath, Path dstPath, HiveConf conf) {
    Task<?> copyTask = null;
    LOG.debug("ReplCopyTask:getDumpCopyTask: " + srcPath + "=>" + dstPath);
    if (replicationSpec.isInReplicationScope()) {
        ReplCopyWork rcwork = new ReplCopyWork(srcPath, dstPath, false);
        LOG.debug("ReplCopyTask:\trcwork");
        if (replicationSpec.isLazy()) {
            LOG.debug("ReplCopyTask:\tlazy");
            rcwork.setListFilesOnOutputBehaviour(true);
        }
        copyTask = TaskFactory.get(rcwork, conf);
    } else {
        LOG.debug("ReplCopyTask:\tcwork");
        copyTask = TaskFactory.get(new CopyWork(srcPath, dstPath, false), conf);
    }
    return copyTask;
}
Also used: ReplCopyWork(org.apache.hadoop.hive.ql.plan.ReplCopyWork) CopyWork(org.apache.hadoop.hive.ql.plan.CopyWork)
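
Since getDumpCopyTask is declared public static, callers wire it directly into the dump task graph. A hedged usage sketch; the paths, replicationSpec, conf, and the parent task are placeholders.

// Sketch only: attaching a dump-side copy task to an existing root task.
Task<?> dumpCopy = ReplCopyTask.getDumpCopyTask(
    replicationSpec, new Path("/warehouse/db.db/t"), new Path("/repl/dump/db/t/data"), conf);
rootTask.addDependentTask(dumpCopy);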

Aggregations

CopyWork (org.apache.hadoop.hive.ql.plan.CopyWork) × 7
Path (org.apache.hadoop.fs.Path) × 4
LoadTableDesc (org.apache.hadoop.hive.ql.plan.LoadTableDesc) × 4
MoveWork (org.apache.hadoop.hive.ql.plan.MoveWork) × 4
LoadFileType (org.apache.hadoop.hive.ql.plan.LoadTableDesc.LoadFileType) × 3
ReplCopyWork (org.apache.hadoop.hive.ql.plan.ReplCopyWork) × 3
Database (org.apache.hadoop.hive.metastore.api.Database) × 2
LoadMultiFilesDesc (org.apache.hadoop.hive.ql.plan.LoadMultiFilesDesc) × 2
DDLWork (org.apache.hadoop.hive.ql.ddl.DDLWork) × 1
AlterTableAddPartitionDesc (org.apache.hadoop.hive.ql.ddl.table.partition.add.AlterTableAddPartitionDesc) × 1
AddPartitionDesc (org.apache.hadoop.hive.ql.plan.AddPartitionDesc) × 1
DDLWork (org.apache.hadoop.hive.ql.plan.DDLWork) × 1