Use of org.apache.hadoop.hive.ql.plan.MoveWork in project hive by apache.
The class DDLTask, method generateAddMmTasks.
private List<Task<?>> generateAddMmTasks(Table tbl) throws HiveException {
  // We will move all the files in the table/partition directories into the first MM
  // directory, then commit the first write ID.
  List<Path> srcs = new ArrayList<>(), tgts = new ArrayList<>();
  long mmWriteId = 0;
  try {
    HiveTxnManager txnManager = SessionState.get().getTxnMgr();
    if (txnManager.isTxnOpen()) {
      mmWriteId = txnManager.getTableWriteId(tbl.getDbName(), tbl.getTableName());
    } else {
      txnManager.openTxn(new Context(conf), conf.getUser());
      mmWriteId = txnManager.getTableWriteId(tbl.getDbName(), tbl.getTableName());
      txnManager.commitTxn();
    }
  } catch (Exception e) {
    String errorMessage = "FAILED: Error in acquiring locks: " + e.getMessage();
    console.printError(errorMessage, "\n" + org.apache.hadoop.util.StringUtils.stringifyException(e));
  }
  int stmtId = 0;
  String mmDir = AcidUtils.deltaSubdir(mmWriteId, mmWriteId, stmtId);
  Hive db = getHive();
  if (tbl.getPartitionKeys().size() > 0) {
    PartitionIterable parts = new PartitionIterable(db, tbl, null,
        HiveConf.getIntVar(conf, ConfVars.METASTORE_BATCH_RETRIEVE_MAX));
    Iterator<Partition> partIter = parts.iterator();
    while (partIter.hasNext()) {
      Partition part = partIter.next();
      checkMmLb(part);
      Path src = part.getDataLocation(), tgt = new Path(src, mmDir);
      srcs.add(src);
      tgts.add(tgt);
      if (Utilities.FILE_OP_LOGGER.isTraceEnabled()) {
        Utilities.FILE_OP_LOGGER.trace("Will move " + src + " to " + tgt);
      }
    }
  } else {
    checkMmLb(tbl);
    Path src = tbl.getDataLocation(), tgt = new Path(src, mmDir);
    srcs.add(src);
    tgts.add(tgt);
    if (Utilities.FILE_OP_LOGGER.isTraceEnabled()) {
      Utilities.FILE_OP_LOGGER.trace("Will move " + src + " to " + tgt);
    }
  }
  // Don't set inputs and outputs - the locks have already been taken so it's pointless.
  MoveWork mw = new MoveWork(null, null, null, null, false);
  mw.setMultiFilesDesc(new LoadMultiFilesDesc(srcs, tgts, true, null, null));
  ImportCommitWork icw = new ImportCommitWork(tbl.getDbName(), tbl.getTableName(), mmWriteId, stmtId);
  Task<?> mv = TaskFactory.get(mw), ic = TaskFactory.get(icw);
  mv.addDependentTask(ic);
  return Lists.<Task<?>>newArrayList(mv);
}
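Stripped of the partition iteration, the wiring above reduces to a small move-then-commit pattern: a MoveWork with no inputs or outputs carries a LoadMultiFilesDesc for the physical move, and an ImportCommitWork task is chained behind it so the write ID is committed only after the files have moved. A minimal sketch of that pattern, using only the classes and constructors shown above (srcs, tgts, writeId, and stmtId are placeholders for the values computed in the method):

// Sketch of the move-then-commit pattern from generateAddMmTasks above.
// srcs/tgts/writeId/stmtId are placeholders for the values computed in the method.
static Task<?> moveThenCommit(String db, String table, List<Path> srcs, List<Path> tgts,
    long writeId, int stmtId) {
  MoveWork mw = new MoveWork(null, null, null, null, false); // no inputs/outputs: locks already held
  mw.setMultiFilesDesc(new LoadMultiFilesDesc(srcs, tgts, true, null, null));
  Task<?> mv = TaskFactory.get(mw);
  // The commit task runs only after the move succeeds, so a failed move never commits the write ID.
  mv.addDependentTask(TaskFactory.get(new ImportCommitWork(db, table, writeId, stmtId)));
  return mv;
}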
Use of org.apache.hadoop.hive.ql.plan.MoveWork in project hive by apache.
The class LoadTable, method loadTableTask.
private Task<?> loadTableTask(Table table, ReplicationSpec replicationSpec, Path tgtPath, Path fromURI) {
  Path dataPath = new Path(fromURI, EximUtil.DATA_PATH_NAME);
  Path tmpPath = PathUtils.getExternalTmpPath(tgtPath, context.pathInfo);
  Task<?> copyTask = ReplCopyTask.getLoadCopyTask(replicationSpec, dataPath, tmpPath, context.hiveConf);
  LoadTableDesc loadTableWork = new LoadTableDesc(tmpPath, Utilities.getTableDesc(table), new TreeMap<>(),
      replicationSpec.isReplace() ? LoadFileType.REPLACE_ALL : LoadFileType.OVERWRITE_EXISTING,
      // todo: what is the point of this? If this is for replication, who would have opened a txn?
      SessionState.get().getTxnMgr().getCurrentTxnId());
  MoveWork moveWork = new MoveWork(new HashSet<>(), new HashSet<>(), loadTableWork, null, false);
  Task<?> loadTableTask = TaskFactory.get(moveWork, context.hiveConf);
  copyTask.addDependentTask(loadTableTask);
  return copyTask;
}
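Note the shape of the returned graph: the method returns copyTask, the root of a two-task chain, with the MoveWork-backed load task hanging off it as a dependent, so a caller schedules one task and the dependency guarantees the load only runs once the data sits in tmpPath. Condensed, with dataPath, tmpPath, txnId, and conf standing in for the values above:

// Condensed copy-then-load chain; dataPath, tmpPath, txnId, and conf are placeholders.
Task<?> copy = ReplCopyTask.getLoadCopyTask(replicationSpec, dataPath, tmpPath, conf);
LoadTableDesc load = new LoadTableDesc(tmpPath, Utilities.getTableDesc(table), new TreeMap<>(),
    replicationSpec.isReplace() ? LoadFileType.REPLACE_ALL : LoadFileType.OVERWRITE_EXISTING, txnId);
Task<?> move = TaskFactory.get(new MoveWork(new HashSet<>(), new HashSet<>(), load, null, false), conf);
copy.addDependentTask(move); // the load never runs before the copy has finished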
Use of org.apache.hadoop.hive.ql.plan.MoveWork in project hive by apache.
The class TestGenMapRedUtilsCreateConditionalTask, method testMovePathsThatCannotBeMerged.
@Test
public void testMovePathsThatCannotBeMerged() {
  final Path condInputPath = new Path("s3a://bucket/scratch/-ext-10000");
  final Path condOutputPath = new Path("s3a://bucket/scratch/-ext-10002");
  final MoveWork mockWork = mock(MoveWork.class);
  assertFalse("A null MoveWork object cannot be merged.",
      GenMapRedUtils.shouldMergeMovePaths(hiveConf, condInputPath, condOutputPath, null));
  hiveConf.set(HiveConf.ConfVars.HIVE_BLOBSTORE_OPTIMIZATIONS_ENABLED.varname, "false");
  assertFalse("Merging paths is not allowed when BlobStorage optimizations are disabled.",
      GenMapRedUtils.shouldMergeMovePaths(hiveConf, condInputPath, condOutputPath, mockWork));
  // Enable BlobStore optimizations for the rest of the tests
  hiveConf.set(HiveConf.ConfVars.HIVE_BLOBSTORE_OPTIMIZATIONS_ENABLED.varname, "true");
  reset(mockWork);
  when(mockWork.getLoadMultiFilesWork()).thenReturn(new LoadMultiFilesDesc());
  assertFalse("Merging paths is not allowed when a MultiFileWork is found in the MoveWork object.",
      GenMapRedUtils.shouldMergeMovePaths(hiveConf, condInputPath, condOutputPath, mockWork));
  reset(mockWork);
  when(mockWork.getLoadFileWork()).thenReturn(mock(LoadFileDesc.class));
  when(mockWork.getLoadTableWork()).thenReturn(mock(LoadTableDesc.class));
  assertFalse("Merging paths is not allowed when both LoadFileWork and LoadTableWork are found in the MoveWork object.",
      GenMapRedUtils.shouldMergeMovePaths(hiveConf, condInputPath, condOutputPath, mockWork));
  reset(mockWork);
  when(mockWork.getLoadFileWork()).thenReturn(new LoadFileDesc(condInputPath, condOutputPath, false, "", "", false));
  assertFalse("Merging paths is not allowed when the conditional output path does not equal the MoveWork input path.",
      GenMapRedUtils.shouldMergeMovePaths(hiveConf, condInputPath, condOutputPath, mockWork));
  reset(mockWork);
  when(mockWork.getLoadFileWork()).thenReturn(new LoadFileDesc(condOutputPath, new Path("unused"), false, "", "", false));
  assertFalse("Merging paths is not allowed when the conditional input path is not a BlobStore path.",
      GenMapRedUtils.shouldMergeMovePaths(hiveConf, new Path("hdfs://hdfs-path"), condOutputPath, mockWork));
  reset(mockWork);
  when(mockWork.getLoadFileWork()).thenReturn(new LoadFileDesc(condOutputPath, new Path("hdfs://hdfs-path"), false, "", "", false));
  assertFalse("Merging paths is not allowed when the MoveWork output path is not a BlobStore path.",
      GenMapRedUtils.shouldMergeMovePaths(hiveConf, condInputPath, condOutputPath, mockWork));
}
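Read together, the assertions pin down the guard conditions shouldMergeMovePaths must enforce. The following is a reconstruction inferred purely from this test, not the actual Hive implementation; the BlobStorageUtils.isBlobStoragePath check and the LoadFileDesc accessor names are assumptions:

// Inferred from the assertions above; the real GenMapRedUtils logic may differ.
static boolean shouldMergeMovePathsSketch(HiveConf conf, Path condInput, Path condOutput, MoveWork mw) {
  if (mw == null) return false;                                          // null MoveWork
  if (!conf.getBoolVar(HiveConf.ConfVars.HIVE_BLOBSTORE_OPTIMIZATIONS_ENABLED)) return false;
  if (mw.getLoadMultiFilesWork() != null) return false;                  // multi-file moves are not merged
  if (mw.getLoadFileWork() != null && mw.getLoadTableWork() != null) return false; // ambiguous work
  LoadFileDesc lfd = mw.getLoadFileWork();
  if (lfd == null || !condOutput.equals(lfd.getSourcePath())) return false; // the two moves must chain
  return BlobStorageUtils.isBlobStoragePath(conf, condInput)             // both ends on blob storage
      && BlobStorageUtils.isBlobStoragePath(conf, lfd.getTargetDir());
}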
Use of org.apache.hadoop.hive.ql.plan.MoveWork in project hive by apache.
The class TestGenMapRedUtilsCreateConditionalTask, method testConditionalMoveTaskIsNotOptimized.
@Test
public void testConditionalMoveTaskIsNotOptimized() throws SemanticException {
  hiveConf.set(HiveConf.ConfVars.HIVE_BLOBSTORE_OPTIMIZATIONS_ENABLED.varname, "false");
  Path sinkDirName = new Path("s3a://bucket/scratch/-ext-10002");
  FileSinkOperator fileSinkOperator = createFileSinkOperator(sinkDirName);
  Path finalDirName = new Path("s3a://bucket/scratch/-ext-10000");
  Path tableLocation = new Path("s3a://bucket/warehouse/table");
  Task<MoveWork> moveTask = createMoveTask(finalDirName, tableLocation);
  List<Task<MoveWork>> moveTaskList = Collections.singletonList(moveTask);
  GenMapRedUtils.createMRWorkForMergingFiles(fileSinkOperator, finalDirName, null,
      moveTaskList, hiveConf, dummyMRTask, new LineageState());
  ConditionalTask conditionalTask = (ConditionalTask) dummyMRTask.getChildTasks().get(0);
  Task<? extends Serializable> moveOnlyTask = conditionalTask.getListTasks().get(0);
  Task<? extends Serializable> mergeOnlyTask = conditionalTask.getListTasks().get(1);
  Task<? extends Serializable> mergeAndMoveTask = conditionalTask.getListTasks().get(2);
  // Verify moveOnlyTask is NOT optimized
  assertEquals(1, moveOnlyTask.getChildTasks().size());
  verifyMoveTask(moveOnlyTask, sinkDirName, finalDirName);
  verifyMoveTask(moveOnlyTask.getChildTasks().get(0), finalDirName, tableLocation);
  // Verify mergeOnlyTask is NOT optimized
  assertEquals(1, mergeOnlyTask.getChildTasks().size());
  verifyMoveTask(mergeOnlyTask.getChildTasks().get(0), finalDirName, tableLocation);
  // Verify mergeAndMoveTask is NOT optimized
  assertEquals(1, mergeAndMoveTask.getChildTasks().size());
  assertEquals(1, mergeAndMoveTask.getChildTasks().get(0).getChildTasks().size());
  verifyMoveTask(mergeAndMoveTask.getChildTasks().get(0), sinkDirName, finalDirName);
  verifyMoveTask(mergeAndMoveTask.getChildTasks().get(0).getChildTasks().get(0), finalDirName, tableLocation);
}
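verifyMoveTask is a helper of this test class and is not shown here; judging only from how it is called (task, expected source, expected target), a hypothetical version could look like the sketch below. The getWork cast and the accessor names are assumptions:

// Hypothetical helper, consistent with the calls above; accessors are assumed.
private void verifyMoveTask(Task<? extends Serializable> task, Path source, Path target) {
  LoadFileDesc lfd = ((MoveWork) task.getWork()).getLoadFileWork();
  assertEquals(source, lfd.getSourcePath());
  assertEquals(target, lfd.getTargetDir());
}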
Use of org.apache.hadoop.hive.ql.plan.MoveWork in project hive by apache.
The class TestGenMapRedUtilsCreateConditionalTask, method testConditionalMoveTaskIsOptimized.
@Test
public void testConditionalMoveTaskIsOptimized() throws SemanticException {
  hiveConf.set(HiveConf.ConfVars.HIVE_BLOBSTORE_OPTIMIZATIONS_ENABLED.varname, "true");
  Path sinkDirName = new Path("s3a://bucket/scratch/-ext-10002");
  FileSinkOperator fileSinkOperator = createFileSinkOperator(sinkDirName);
  Path finalDirName = new Path("s3a://bucket/scratch/-ext-10000");
  Path tableLocation = new Path("s3a://bucket/warehouse/table");
  Task<MoveWork> moveTask = createMoveTask(finalDirName, tableLocation);
  List<Task<MoveWork>> moveTaskList = Collections.singletonList(moveTask);
  GenMapRedUtils.createMRWorkForMergingFiles(fileSinkOperator, finalDirName, null,
      moveTaskList, hiveConf, dummyMRTask, new LineageState());
  ConditionalTask conditionalTask = (ConditionalTask) dummyMRTask.getChildTasks().get(0);
  Task<? extends Serializable> moveOnlyTask = conditionalTask.getListTasks().get(0);
  Task<? extends Serializable> mergeOnlyTask = conditionalTask.getListTasks().get(1);
  Task<? extends Serializable> mergeAndMoveTask = conditionalTask.getListTasks().get(2);
  /*
   * OPTIMIZATION
   * The ConditionalTask avoids linking two MoveTasks, which are expensive on blob storage systems.
   * Instead of linking them, it creates a single MoveTask whose source is the first MoveTask's
   * source and whose target is the second MoveTask's target.
   */
  // Verify moveOnlyTask is optimized
  assertNull(moveOnlyTask.getChildTasks());
  verifyMoveTask(moveOnlyTask, sinkDirName, tableLocation);
  // Verify mergeOnlyTask is NOT optimized (a merge task writes directly to finalDirName, then a MoveTask is executed)
  assertEquals(1, mergeOnlyTask.getChildTasks().size());
  verifyMoveTask(mergeOnlyTask.getChildTasks().get(0), finalDirName, tableLocation);
  // Verify mergeAndMoveTask is NOT optimized
  assertEquals(1, mergeAndMoveTask.getChildTasks().size());
  assertEquals(1, mergeAndMoveTask.getChildTasks().get(0).getChildTasks().size());
  verifyMoveTask(mergeAndMoveTask.getChildTasks().get(0), sinkDirName, finalDirName);
  verifyMoveTask(mergeAndMoveTask.getChildTasks().get(0).getChildTasks().get(0), finalDirName, tableLocation);
}
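The optimization amounts to collapsing a two-hop move (sink dir to final dir, then final dir to table) into a single MoveWork from the first source to the second target. A conceptual sketch follows, not the actual GenMapRedUtils code; it reuses the six-argument LoadFileDesc constructor from the earlier test, and the setLoadFileWork setter and getTargetDir accessor are assumptions:

// Conceptual sketch: fold move A->B followed by move B->C into one move A->C.
// Not the actual GenMapRedUtils implementation.
static MoveWork mergeMoves(Path firstSource, LoadFileDesc secondMove) {
  MoveWork merged = new MoveWork(null, null, null, null, false);
  // One blob-store copy straight from the sink directory to the final target,
  // skipping the intermediate -ext-* hop.
  merged.setLoadFileWork(new LoadFileDesc(firstSource, secondMove.getTargetDir(), false, "", "", false));
  return merged;
}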