use of org.apache.hadoop.hive.ql.plan.MoveWork in project hive by apache.
the class ImportSemanticAnalyzer method loadTable.
private static Task<?> loadTable(URI fromURI, Table table, boolean replace, Path tgtPath, ReplicationSpec replicationSpec, EximUtil.SemanticAnalyzerWrapperContext x, Long writeId, int stmtId, boolean isSourceMm) {
Path dataPath = new Path(fromURI.toString(), EximUtil.DATA_PATH_NAME);
Path destPath = null, loadPath = null;
LoadFileType lft;
if (AcidUtils.isInsertOnlyTable(table)) {
String mmSubdir = replace ? AcidUtils.baseDir(writeId) : AcidUtils.deltaSubdir(writeId, writeId, stmtId);
destPath = new Path(tgtPath, mmSubdir);
loadPath = tgtPath;
lft = LoadFileType.KEEP_EXISTING;
} else {
destPath = loadPath = x.getCtx().getExternalTmpPath(tgtPath);
lft = replace ? LoadFileType.REPLACE_ALL : LoadFileType.OVERWRITE_EXISTING;
}
if (Utilities.FILE_OP_LOGGER.isTraceEnabled()) {
Utilities.FILE_OP_LOGGER.trace("adding import work for table with source location: " + dataPath + "; table: " + tgtPath + "; copy destination " + destPath + "; mm " + writeId + " (src " + isSourceMm + ") for " + (table == null ? "a new table" : table.getTableName()));
}
Task<?> copyTask = null;
if (replicationSpec.isInReplicationScope()) {
if (isSourceMm || isAcid(writeId)) {
// Note: this is replication gap, not MM gap... Repl V2 is not ready yet.
throw new RuntimeException("Replicating MM and ACID tables is not supported");
}
copyTask = ReplCopyTask.getLoadCopyTask(replicationSpec, dataPath, destPath, x.getConf());
} else {
CopyWork cw = new CopyWork(dataPath, destPath, false);
cw.setSkipSourceMmDirs(isSourceMm);
copyTask = TaskFactory.get(cw);
}
LoadTableDesc loadTableWork = new LoadTableDesc(loadPath, Utilities.getTableDesc(table), new TreeMap<>(), lft, writeId);
loadTableWork.setStmtId(stmtId);
MoveWork mv = new MoveWork(x.getInputs(), x.getOutputs(), loadTableWork, null, false);
Task<?> loadTableTask = TaskFactory.get(mv);
copyTask.addDependentTask(loadTableTask);
x.getTasks().add(copyTask);
return loadTableTask;
}
use of org.apache.hadoop.hive.ql.plan.MoveWork in project hive by apache.
the class GenMapRedUtils method createMRWorkForMergingFiles.
/**
* @param fsInput The FileSink operator.
* @param finalName the final destination path the merge job should output.
* @param dependencyTask
* @param mvTasks
* @param conf
* @param currTask
* @param lineageState
* @throws SemanticException
*
* create a Map-only merge job using CombineHiveInputFormat for all partitions with
* following operators:
* MR job J0:
* ...
* |
* v
* FileSinkOperator_1 (fsInput)
* |
* v
* Merge job J1:
* |
* v
* TableScan (using CombineHiveInputFormat) (tsMerge)
* |
* v
* FileSinkOperator (fsMerge)
*
* Here the pathToPartitionInfo & pathToAlias will remain the same, which means the paths
* do
* not contain the dynamic partitions (their parent). So after the dynamic partitions are
* created (after the first job finished before the moveTask or ConditionalTask start),
* we need to change the pathToPartitionInfo & pathToAlias to include the dynamic
* partition
* directories.
*/
public static void createMRWorkForMergingFiles(FileSinkOperator fsInput, Path finalName, DependencyCollectionTask dependencyTask, List<Task<MoveWork>> mvTasks, HiveConf conf, Task<? extends Serializable> currTask, LineageState lineageState) throws SemanticException {
//
// 1. create the operator tree
//
FileSinkDesc fsInputDesc = fsInput.getConf();
if (Utilities.FILE_OP_LOGGER.isTraceEnabled()) {
Utilities.FILE_OP_LOGGER.trace("Creating merge work from " + System.identityHashCode(fsInput) + " with write ID " + (fsInputDesc.isMmTable() ? fsInputDesc.getTableWriteId() : null) + " into " + finalName);
}
boolean isBlockMerge = (conf.getBoolVar(ConfVars.HIVEMERGERCFILEBLOCKLEVEL) && fsInputDesc.getTableInfo().getInputFileFormatClass().equals(RCFileInputFormat.class)) || (conf.getBoolVar(ConfVars.HIVEMERGEORCFILESTRIPELEVEL) && fsInputDesc.getTableInfo().getInputFileFormatClass().equals(OrcInputFormat.class));
RowSchema inputRS = fsInput.getSchema();
Long srcMmWriteId = fsInputDesc.isMmTable() ? fsInputDesc.getTableWriteId() : null;
FileSinkDesc fsOutputDesc = null;
TableScanOperator tsMerge = null;
if (!isBlockMerge) {
// Create a TableScan operator
tsMerge = GenMapRedUtils.createTemporaryTableScanOperator(fsInput.getCompilationOpContext(), inputRS);
// Create a FileSink operator
TableDesc ts = (TableDesc) fsInputDesc.getTableInfo().clone();
Path mergeDest = srcMmWriteId == null ? finalName : finalName.getParent();
fsOutputDesc = new FileSinkDesc(mergeDest, ts, conf.getBoolVar(ConfVars.COMPRESSRESULT));
fsOutputDesc.setMmWriteId(srcMmWriteId);
fsOutputDesc.setIsMerge(true);
// Create and attach the filesink for the merge.
OperatorFactory.getAndMakeChild(fsOutputDesc, inputRS, tsMerge);
}
// If the input FileSinkOperator is a dynamic partition enabled, the tsMerge input schema
// needs to include the partition column, and the fsOutput should have
// a DynamicPartitionCtx to indicate that it needs to dynamically partitioned.
DynamicPartitionCtx dpCtx = fsInputDesc.getDynPartCtx();
if (dpCtx != null && dpCtx.getNumDPCols() > 0) {
// adding DP ColumnInfo to the RowSchema signature
ArrayList<ColumnInfo> signature = inputRS.getSignature();
String tblAlias = fsInputDesc.getTableInfo().getTableName();
for (String dpCol : dpCtx.getDPColNames()) {
ColumnInfo colInfo = new ColumnInfo(dpCol, // all partition column type should be string
TypeInfoFactory.stringTypeInfo, tblAlias, // partition column is virtual column
true);
signature.add(colInfo);
}
inputRS.setSignature(signature);
if (!isBlockMerge) {
// create another DynamicPartitionCtx, which has a different input-to-DP column mapping
DynamicPartitionCtx dpCtx2 = new DynamicPartitionCtx(dpCtx);
fsOutputDesc.setDynPartCtx(dpCtx2);
}
// update the FileSinkOperator to include partition columns
usePartitionColumns(fsInputDesc.getTableInfo().getProperties(), dpCtx.getDPColNames());
} else {
// non-partitioned table
fsInputDesc.getTableInfo().getProperties().remove(org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.META_TABLE_PARTITION_COLUMNS);
}
//
// 2. Constructing a conditional task consisting of a move task and a map reduce task
//
Path inputDirName = fsInputDesc.getMergeInputDirName();
MapWork cplan;
Serializable work;
if (isBlockMerge) {
cplan = GenMapRedUtils.createMergeTask(fsInputDesc, finalName, dpCtx != null && dpCtx.getNumDPCols() > 0, fsInput.getCompilationOpContext());
if (conf.getVar(ConfVars.HIVE_EXECUTION_ENGINE).equals("tez")) {
work = new TezWork(conf.getVar(HiveConf.ConfVars.HIVEQUERYID), conf);
cplan.setName("File Merge");
((TezWork) work).add(cplan);
} else if (conf.getVar(ConfVars.HIVE_EXECUTION_ENGINE).equals("spark")) {
work = new SparkWork(conf.getVar(HiveConf.ConfVars.HIVEQUERYID));
cplan.setName("Spark Merge File Work");
((SparkWork) work).add(cplan);
} else {
work = cplan;
}
} else {
cplan = createMRWorkForMergingFiles(conf, tsMerge, fsInputDesc);
if (conf.getVar(ConfVars.HIVE_EXECUTION_ENGINE).equals("tez")) {
work = new TezWork(conf.getVar(HiveConf.ConfVars.HIVEQUERYID), conf);
cplan.setName("File Merge");
((TezWork) work).add(cplan);
} else if (conf.getVar(ConfVars.HIVE_EXECUTION_ENGINE).equals("spark")) {
work = new SparkWork(conf.getVar(HiveConf.ConfVars.HIVEQUERYID));
cplan.setName("Spark Merge File Work");
((SparkWork) work).add(cplan);
} else {
work = new MapredWork();
((MapredWork) work).setMapWork(cplan);
}
}
// use CombineHiveInputFormat for map-only merging
cplan.setInputformat("org.apache.hadoop.hive.ql.io.CombineHiveInputFormat");
// NOTE: we should gather stats in MR1 rather than MR2 at merge job since we don't
// know if merge MR2 will be triggered at execution time
MoveWork dummyMv = null;
if (srcMmWriteId == null) {
// Only create the movework for non-MM table. No action needed for a MM table.
dummyMv = new MoveWork(null, null, null, new LoadFileDesc(inputDirName, finalName, true, null, null, false), false);
}
// Use the original fsOp path here in case of MM - while the new FSOP merges files inside the
// MM directory, the original MoveTask still commits based on the parent. Note that this path
// can only be triggered for a merge that's part of insert for now; MM tables do not support
// concatenate. Keeping the old logic for non-MM tables with temp directories and stuff.
Path fsopPath = srcMmWriteId != null ? fsInputDesc.getFinalDirName() : finalName;
Task<MoveWork> mvTask = GenMapRedUtils.findMoveTaskForFsopOutput(mvTasks, fsopPath, fsInputDesc.isMmTable());
ConditionalTask cndTsk = GenMapRedUtils.createCondTask(conf, currTask, dummyMv, work, fsInputDesc.getMergeInputDirName(), finalName, mvTask, dependencyTask, lineageState);
// keep the dynamic partition context in conditional task resolver context
ConditionalResolverMergeFilesCtx mrCtx = (ConditionalResolverMergeFilesCtx) cndTsk.getResolverCtx();
mrCtx.setDPCtx(fsInputDesc.getDynPartCtx());
mrCtx.setLbCtx(fsInputDesc.getLbCtx());
}
use of org.apache.hadoop.hive.ql.plan.MoveWork in project hive by apache.
the class GenMapRedUtils method mergeMovePaths.
/**
* Merges the given Conditional input path and the linked MoveWork into one only MoveWork.
* This is an optimization for BlobStore systems to avoid doing two renames or copies that are not necessary.
*
* @param condInputPath A path that the ConditionalTask uses as input for its sub-tasks.
* @param linkedMoveWork A MoveWork that the ConditionalTask uses to link to its sub-tasks.
* @param lineageState A LineageState used to track what changes.
* @return A new MoveWork that has the Conditional input path as source and the linkedMoveWork as target.
*/
@VisibleForTesting
protected static MoveWork mergeMovePaths(Path condInputPath, MoveWork linkedMoveWork, LineageState lineageState) {
MoveWork newWork = new MoveWork(linkedMoveWork);
LoadFileDesc fileDesc = null;
LoadTableDesc tableDesc = null;
if (linkedMoveWork.getLoadFileWork() != null) {
fileDesc = new LoadFileDesc(linkedMoveWork.getLoadFileWork());
fileDesc.setSourcePath(condInputPath);
lineageState.updateDirToOpMap(condInputPath, linkedMoveWork.getLoadFileWork().getSourcePath());
} else if (linkedMoveWork.getLoadTableWork() != null) {
tableDesc = new LoadTableDesc(linkedMoveWork.getLoadTableWork());
tableDesc.setSourcePath(condInputPath);
lineageState.updateDirToOpMap(condInputPath, linkedMoveWork.getLoadTableWork().getSourcePath());
} else {
throw new IllegalArgumentException("Merging a path with a MoveWork with multi-files work is not allowed.");
}
newWork.setLoadFileWork(fileDesc);
newWork.setLoadTableWork(tableDesc);
return newWork;
}
use of org.apache.hadoop.hive.ql.plan.MoveWork in project hive by apache.
the class GenMapRedUtils method createMoveTask.
/**
* Create and add any dependent move tasks
*
* @param currTask
* @param chDir
* @param fsOp
* @param parseCtx
* @param mvTasks
* @param hconf
* @param dependencyTask
* @return
*/
public static Path createMoveTask(Task<? extends Serializable> currTask, boolean chDir, FileSinkOperator fsOp, ParseContext parseCtx, List<Task<MoveWork>> mvTasks, HiveConf hconf, DependencyCollectionTask dependencyTask) {
Path dest = null;
FileSinkDesc fileSinkDesc = fsOp.getConf();
boolean isMmTable = fileSinkDesc.isMmTable();
if (chDir) {
dest = fileSinkDesc.getMergeInputDirName();
if (!isMmTable) {
// generate the temporary file
// it must be on the same file system as the current destination
Context baseCtx = parseCtx.getContext();
// Create the required temporary file in the HDFS location if the destination
// path of the FileSinkOperator table is a blobstore path.
Path tmpDir = baseCtx.getTempDirForFinalJobPath(fileSinkDesc.getDestPath());
// Change all the linked file sink descriptors
if (fileSinkDesc.isLinkedFileSink()) {
for (FileSinkDesc fsConf : fileSinkDesc.getLinkedFileSinkDesc()) {
fsConf.setDirName(new Path(tmpDir, fsConf.getDirName().getName()));
if (Utilities.FILE_OP_LOGGER.isTraceEnabled()) {
Utilities.FILE_OP_LOGGER.trace("createMoveTask setting tmpDir for LinkedFileSink chDir " + fsConf.getDirName() + "; dest was " + fileSinkDesc.getDestPath());
}
}
} else {
fileSinkDesc.setDirName(tmpDir);
if (Utilities.FILE_OP_LOGGER.isTraceEnabled()) {
Utilities.FILE_OP_LOGGER.trace("createMoveTask setting tmpDir chDir " + tmpDir + "; dest was " + fileSinkDesc.getDestPath());
}
}
}
}
Task<MoveWork> mvTask = null;
if (!chDir) {
mvTask = GenMapRedUtils.findMoveTaskForFsopOutput(mvTasks, fsOp.getConf().getFinalDirName(), fsOp.getConf().isMmTable());
}
// Set the move task to be dependent on the current task
if (mvTask != null) {
GenMapRedUtils.addDependentMoveTasks(mvTask, hconf, currTask, dependencyTask);
}
return dest;
}
use of org.apache.hadoop.hive.ql.plan.MoveWork in project hive by apache.
the class GenMapRedUtils method addStatsTask.
/**
* Add the StatsTask as a dependent task of the MoveTask
* because StatsTask will change the Table/Partition metadata. For atomicity, we
* should not change it before the data is actually there done by MoveTask.
*
* @param nd
* the FileSinkOperator whose results are taken care of by the MoveTask.
* @param mvTask
* The MoveTask that moves the FileSinkOperator's results.
* @param currTask
* The MapRedTask that the FileSinkOperator belongs to.
* @param hconf
* HiveConf
*/
public static void addStatsTask(FileSinkOperator nd, MoveTask mvTask, Task<? extends Serializable> currTask, HiveConf hconf) {
MoveWork mvWork = mvTask.getWork();
BasicStatsWork statsWork = null;
Table table = null;
boolean truncate = false;
if (mvWork.getLoadTableWork() != null) {
statsWork = new BasicStatsWork(mvWork.getLoadTableWork());
String tableName = mvWork.getLoadTableWork().getTable().getTableName();
truncate = mvWork.getLoadTableWork().getReplace();
try {
table = Hive.get().getTable(SessionState.get().getCurrentDatabase(), tableName);
} catch (HiveException e) {
throw new RuntimeException("unexpected; table should be present already..: " + tableName, e);
}
} else if (mvWork.getLoadFileWork() != null) {
statsWork = new BasicStatsWork(mvWork.getLoadFileWork());
truncate = true;
if (mvWork.getLoadFileWork().getCtasCreateTableDesc() != null) {
try {
table = mvWork.getLoadFileWork().getCtasCreateTableDesc().toTable(hconf);
} catch (HiveException e) {
LOG.debug("can't pre-create table for CTAS", e);
table = null;
}
} else if (mvWork.getLoadFileWork().getCreateViewDesc() != null) {
// CREATE MATERIALIZED VIEW ...
try {
table = mvWork.getLoadFileWork().getCreateViewDesc().toTable(hconf);
} catch (HiveException e) {
LOG.debug("can't pre-create table for MV", e);
table = null;
}
} else {
throw new RuntimeException("unexpected; this should be a CTAS or a CREATE/REBUILD MV - however no desc present");
}
}
assert statsWork != null : "Error when generating StatsTask";
if (currTask.getWork() instanceof MapredWork) {
MapredWork mrWork = (MapredWork) currTask.getWork();
mrWork.getMapWork().setGatheringStats(true);
if (mrWork.getReduceWork() != null) {
mrWork.getReduceWork().setGatheringStats(true);
}
} else if (currTask.getWork() instanceof SparkWork) {
SparkWork work = (SparkWork) currTask.getWork();
for (BaseWork w : work.getAllWork()) {
w.setGatheringStats(true);
}
} else {
// must be TezWork
TezWork work = (TezWork) currTask.getWork();
for (BaseWork w : work.getAllWork()) {
w.setGatheringStats(true);
}
}
StatsWork columnStatsWork = new StatsWork(table, statsWork, hconf);
columnStatsWork.collectStatsFromAggregator(nd.getConf());
columnStatsWork.truncateExisting(truncate);
columnStatsWork.setSourceTask(currTask);
Task<? extends Serializable> statsTask = TaskFactory.get(columnStatsWork);
// subscribe feeds from the MoveTask so that MoveTask can forward the list
// of dynamic partition list to the StatsTask
mvTask.addDependentTask(statsTask);
statsTask.subscribeFeed(mvTask);
}
Aggregations