Use of org.apache.hadoop.hive.ql.plan.LoadFileDesc in the Apache Hive project.
Class GenMapRedUtils, method mergeMovePaths (overload that takes a LineageState).
* Merges the given Conditional input path and the linked MoveWork into one only MoveWork.
* This is an optimization for BlobStore systems to avoid doing two renames or copies that are not necessary.
* @param condInputPath A path that the ConditionalTask uses as input for its sub-tasks.
* @param linkedMoveWork A MoveWork that the ConditionalTask uses to link to its sub-tasks.
* @param lineageState A LineageState used to track what changes.
* @return A new MoveWork that has the Conditional input path as source and the linkedMoveWork as target.
protected static MoveWork mergeMovePaths(Path condInputPath, MoveWork linkedMoveWork, LineageState lineageState) {
MoveWork newWork = new MoveWork(linkedMoveWork);
LoadFileDesc fileDesc = null;
LoadTableDesc tableDesc = null;
if (linkedMoveWork.getLoadFileWork() != null) {
fileDesc = new LoadFileDesc(linkedMoveWork.getLoadFileWork());
lineageState.updateDirToOpMap(condInputPath, linkedMoveWork.getLoadFileWork().getSourcePath());
} else if (linkedMoveWork.getLoadTableWork() != null) {
tableDesc = new LoadTableDesc(linkedMoveWork.getLoadTableWork());
lineageState.updateDirToOpMap(condInputPath, linkedMoveWork.getLoadTableWork().getSourcePath());
} else {
throw new IllegalArgumentException("Merging a path with a MoveWork with multi-files work is not allowed.");
return newWork;
Use of org.apache.hadoop.hive.ql.plan.LoadFileDesc in the Apache Hive project.
Class TestGenMapRedUtilsCreateConditionalTask, method testMovePathsThatCannotBeMerged.
public void testMovePathsThatCannotBeMerged() {
final Path condInputPath = new Path("s3a://bucket/scratch/-ext-10000");
final Path condOutputPath = new Path("s3a://bucket/scratch/-ext-10002");
final MoveWork mockWork = mock(MoveWork.class);
assertFalse("A MoveWork null object cannot be merged.", GenMapRedUtils.shouldMergeMovePaths(hiveConf, condInputPath, condOutputPath, null));
hiveConf.set(HiveConf.ConfVars.HIVE_BLOBSTORE_OPTIMIZATIONS_ENABLED.varname, "false");
assertFalse("Merging paths is not allowed when BlobStorage optimizations are disabled.", GenMapRedUtils.shouldMergeMovePaths(hiveConf, condInputPath, condOutputPath, mockWork));
// Enable BlobStore optimizations for the rest of tests
hiveConf.set(HiveConf.ConfVars.HIVE_BLOBSTORE_OPTIMIZATIONS_ENABLED.varname, "true");
when(mockWork.getLoadMultiFilesWork()).thenReturn(new LoadMultiFilesDesc());
assertFalse("Merging paths is not allowed when MultiFileWork is found in the MoveWork object.", GenMapRedUtils.shouldMergeMovePaths(hiveConf, condInputPath, condOutputPath, mockWork));
assertFalse("Merging paths is not allowed when both LoadFileWork & LoadTableWork are found in the MoveWork object.", GenMapRedUtils.shouldMergeMovePaths(hiveConf, condInputPath, condOutputPath, mockWork));
when(mockWork.getLoadFileWork()).thenReturn(new LoadFileDesc(condInputPath, condOutputPath, false, "", "", false));
assertFalse("Merging paths is not allowed when both conditional output path is not equals to MoveWork input path.", GenMapRedUtils.shouldMergeMovePaths(hiveConf, condInputPath, condOutputPath, mockWork));
when(mockWork.getLoadFileWork()).thenReturn(new LoadFileDesc(condOutputPath, new Path("unused"), false, "", "", false));
assertFalse("Merging paths is not allowed when conditional input path is not a BlobStore path.", GenMapRedUtils.shouldMergeMovePaths(hiveConf, new Path("hdfs://hdfs-path"), condOutputPath, mockWork));
when(mockWork.getLoadFileWork()).thenReturn(new LoadFileDesc(condOutputPath, new Path("hdfs://hdfs-path"), false, "", "", false));
assertFalse("Merging paths is not allowed when MoveWork output path is not a BlobStore path.", GenMapRedUtils.shouldMergeMovePaths(hiveConf, condInputPath, condOutputPath, mockWork));
Use of org.apache.hadoop.hive.ql.plan.LoadFileDesc in the Apache Hive project.
Class GenMapRedUtils, method mergeMovePaths (overload without a LineageState).
* Merges the given Conditional input path and the linked MoveWork into one only MoveWork.
* This is an optimization for BlobStore systems to avoid doing two renames or copies that are not necessary.
* @param condInputPath A path that the ConditionalTask uses as input for its sub-tasks.
* @param linkedMoveWork A MoveWork that the ConditionalTask uses to link to its sub-tasks.
* @return A new MoveWork that has the Conditional input path as source and the linkedMoveWork as target.
protected static MoveWork mergeMovePaths(Path condInputPath, MoveWork linkedMoveWork) {
MoveWork newWork = new MoveWork(linkedMoveWork);
LoadFileDesc fileDesc = null;
LoadTableDesc tableDesc = null;
if (linkedMoveWork.getLoadFileWork() != null) {
fileDesc = new LoadFileDesc(linkedMoveWork.getLoadFileWork());
} else if (linkedMoveWork.getLoadTableWork() != null) {
tableDesc = new LoadTableDesc(linkedMoveWork.getLoadTableWork());
} else {
throw new IllegalArgumentException("Merging a path with a MoveWork with multi-files work is not allowed.");
return newWork;
Use of org.apache.hadoop.hive.ql.plan.LoadFileDesc in the Apache Hive project.
Class MoveTask, method execute.
/**
 * Runs the MoveWork held in {@code work}: handles its load-file descriptor, then its
 * multi-file descriptor, then its load-table descriptor, each only when it is non-null.
 *
 * NOTE(review): this listing has lost its closing braces and a few statements look truncated
 * (the isFullAcidOp expression and the "Partition is" log call) - confirm exact nesting
 * against the upstream file before relying on it.
 *
 * @param driverContext context whose getCtx().getExplainAnalyze() state is checked before any work is done.
 * @return 0 on success (or when explain-analyze is RUNNING); the canonical error code of a
 *         HiveException when it is not GENERIC_ERROR, otherwise 1; 1 for any other exception.
 */
public int execute(DriverContext driverContext) {
if (Utilities.FILE_OP_LOGGER.isTraceEnabled()) {
Utilities.FILE_OP_LOGGER.trace("Executing MoveWork " + System.identityHashCode(work) + " with " + work.getLoadFileWork() + "; " + work.getLoadTableWork() + "; " + work.getLoadMultiFilesWork());
try {
// While an EXPLAIN ANALYZE run is in the RUNNING state, report success without moving anything.
if (driverContext.getCtx().getExplainAnalyze() == AnalyzeState.RUNNING) {
return 0;
Hive db = getHive();
// Do any hive related operations like moving tables and files
// to appropriate locations
LoadFileDesc lfd = work.getLoadFileWork();
if (lfd != null) {
Path targetPath = lfd.getTargetDir();
Path sourcePath = lfd.getSourcePath();
// Nothing to move when source and target are the same path; only log it.
if (targetPath.equals(sourcePath)) {
Utilities.FILE_OP_LOGGER.debug("MoveTask not moving " + sourcePath);
} else {
Utilities.FILE_OP_LOGGER.debug("MoveTask moving " + sourcePath + " to " + targetPath);
// INSERT write type goes through Hive.moveAcidFiles; every other write type uses moveFile.
if (lfd.getWriteType() == AcidUtils.Operation.INSERT) {
// 'sourcePath' result of 'select ...' part of CTAS statement
assert lfd.getIsDfsDir();
FileSystem srcFs = sourcePath.getFileSystem(conf);
FileStatus[] srcs = srcFs.globStatus(sourcePath);
// A null glob result is treated as "nothing to move" and only logged.
if (srcs != null) {
List<Path> newFiles = new ArrayList<>();
Hive.moveAcidFiles(srcFs, srcs, targetPath, newFiles);
} else {
LOG.debug("No files found to move from " + sourcePath + " to " + targetPath);
} else {
moveFile(sourcePath, targetPath, lfd.getIsDfsDir());
// Multi-file load is for dynamic partitions when some partitions do not
// need to merge and they can simply be moved to the target directory.
// This is also used for MM table conversion.
LoadMultiFilesDesc lmfd = work.getLoadMultiFilesWork();
if (lmfd != null) {
boolean isDfsDir = lmfd.getIsDfsDir();
List<String> targetPrefixes = lmfd.getTargetPrefixes();
// Source dirs, target dirs and (when present) target prefixes are read at the same index.
for (int i = 0; i < lmfd.getSourceDirs().size(); ++i) {
Path srcPath = lmfd.getSourceDirs().get(i);
Path destPath = lmfd.getTargetDirs().get(i);
String filePrefix = targetPrefixes == null ? null : targetPrefixes.get(i);
FileSystem destFs = destPath.getFileSystem(conf);
// Without a prefix the source is moved as a whole to destPath.
if (filePrefix == null) {
// NOTE(review): no statement guarded by this parent-existence check is visible in this listing.
if (!destFs.exists(destPath.getParent())) {
Utilities.FILE_OP_LOGGER.debug("MoveTask moving (multi-file) " + srcPath + " to " + destPath);
moveFile(srcPath, destPath, isDfsDir);
} else {
// With a prefix, each child of srcPath is moved individually under destPath with the
// prefix prepended to its file name.
// NOTE(review): no statement guarded by this destination-existence check is visible in this listing.
if (!destFs.exists(destPath)) {
FileSystem srcFs = srcPath.getFileSystem(conf);
FileStatus[] children = srcFs.listStatus(srcPath);
if (children != null) {
for (FileStatus child : children) {
Path childSrc = child.getPath();
Path childDest = new Path(destPath, filePrefix + childSrc.getName());
Utilities.FILE_OP_LOGGER.debug("MoveTask moving (multi-file) " + childSrc + " to " + childDest);
moveFile(childSrc, childDest, isDfsDir);
} else {
Utilities.FILE_OP_LOGGER.debug("MoveTask skipping empty directory (multi-file) " + srcPath);
// Non-recursive delete of the source directory; a false result is reported as an IOException.
if (!srcFs.delete(srcPath, false)) {
throw new IOException("Couldn't delete " + srcPath + " after moving all the files");
// Next we do this for tables and partitions
LoadTableDesc tbd = work.getLoadTableWork();
if (tbd != null) {
Table table = db.getTable(tbd.getTable().getTableName());
checkFileFormats(db, tbd, table);
boolean isFullAcidOp = work.getLoadTableWork().getWriteType() != AcidUtils.Operation.NOT_ACID && // it seems that LoadTableDesc has Operation.INSERT only for CTAS...
// NOTE(review): the remainder of the isFullAcidOp expression is missing from this listing.
// Create a data container
DataContainer dc = null;
// An empty partition spec means a plain table load via db.loadTable.
if (tbd.getPartitionSpec().size() == 0) {
dc = new DataContainer(table.getTTable());
if (Utilities.FILE_OP_LOGGER.isTraceEnabled()) {
Utilities.FILE_OP_LOGGER.trace("loadTable called from " + tbd.getSourcePath() + " into " + tbd.getTable().getTableName());
db.loadTable(tbd.getSourcePath(), tbd.getTable().getTableName(), tbd.getLoadFileType(), work.isSrcLocal(), isSkewedStoredAsDirs(tbd), isFullAcidOp, hasFollowingStatsTask(), tbd.getWriteId(), tbd.getStmtId());
// Record the table as a write entity only when the work tracks outputs.
if (work.getOutputs() != null) {
DDLTask.addIfAbsentByName(new WriteEntity(table, getWriteType(tbd, work.getLoadTableWork().getWriteType())), work.getOutputs());
} else {"Partition is: {}", tbd.getPartitionSpec());
// NOTE(review): the line above looks like a log call whose beginning was lost in extraction.
// Check if the bucketing and/or sorting columns were inferred
TaskInformation ti = new TaskInformation(this, tbd.getSourcePath().toUri().toString());
// deal with dynamic partitions
DynamicPartitionCtx dpCtx = tbd.getDPCtx();
if (dpCtx != null && dpCtx.getNumDPCols() > 0) {
// dynamic partitions
dc = handleDynParts(db, table, tbd, ti, dpCtx);
} else {
// static partitions
dc = handleStaticParts(db, table, tbd, ti);
if (dc != null) {
// If we are doing an update or a delete the number of columns in the table will not
// match the number of columns in the file sink. For update there will be one too many
// (because of the ROW__ID), and in the case of the delete there will be just the
// ROW__ID, which we don't need to worry about from a lineage perspective.
List<FieldSchema> tableCols = null;
switch(work.getLoadTableWork().getWriteType()) {
case DELETE:
case UPDATE:
// Pass an empty list as no columns will be written to the file.
// TODO I should be able to make this work for update
tableCols = new ArrayList<>();
// NOTE(review): the break/default labels that presumably separate these two assignments
// are not visible in this listing - verify against upstream.
tableCols = table.getCols();
queryState.getLineageState().setLineage(tbd.getSourcePath(), dc, tableCols);
return 0;
} catch (HiveException he) {
// Default to 1; use the canonical message's own error code when it is not GENERIC_ERROR.
int errorCode = 1;
if (he.getCanonicalErrorMsg() != ErrorMsg.GENERIC_ERROR) {
errorCode = he.getCanonicalErrorMsg().getErrorCode();
if (he.getCanonicalErrorMsg() == ErrorMsg.UNRESOLVED_RT_EXCEPTION) {
console.printError("Failed with exception " + he.getMessage(), "\n" + StringUtils.stringifyException(he));
} else {
console.printError("Failed with exception " + he.getMessage() + "\nRemote Exception: " + he.getRemoteErrorMsg());
console.printInfo("\n", StringUtils.stringifyException(he), false);
return errorCode;
} catch (Exception e) {
// Any other failure is printed with its stack trace and reported as error code 1.
console.printError("Failed with exception " + e.getMessage(), "\n" + StringUtils.stringifyException(e));
return (1);
Use of org.apache.hadoop.hive.ql.plan.LoadFileDesc in the Apache Hive project.
Class TestGenMapRedUtilsCreateConditionalTask, method testMovePathsThatCanBeMerged.
/**
 * Positive case for GenMapRedUtils.shouldMergeMovePaths: all three paths use the s3a scheme and
 * the mocked MoveWork's load-file source equals the conditional output path.
 */
public void testMovePathsThatCanBeMerged() {
final Path condInputPath = new Path("s3a://bucket/scratch/-ext-10000");
final Path condOutputPath = new Path("s3a://bucket/scratch/-ext-10002");
final Path targetMoveWorkPath = new Path("s3a://bucket/scratch/-ext-10003");
final MoveWork mockWork = mock(MoveWork.class);
// The linked move reads from the conditional output path and writes to targetMoveWorkPath.
when(mockWork.getLoadFileWork()).thenReturn(new LoadFileDesc(condOutputPath, targetMoveWorkPath, false, "", "", false));
assertTrue("Merging BlobStore paths should be allowed.", GenMapRedUtils.shouldMergeMovePaths(hiveConf, condInputPath, condOutputPath, mockWork));