use of org.apache.hadoop.hive.ql.plan.MoveWork in project hive by apache.
the class GenSparkUtils method createMoveTask.
/**
 * Create and add any dependent move tasks.
 *
 * This is forked from {@link GenMapRedUtils}. The difference is that it doesn't check
 * 'isLinkedFileSink' and does not set parent dir for the linked file sinks.
 */
public static Path createMoveTask(Task<? extends Serializable> currTask, boolean chDir,
    FileSinkOperator fsOp, ParseContext parseCtx, List<Task<MoveWork>> mvTasks, HiveConf hconf,
    DependencyCollectionTask dependencyTask) {
  Path dest = null;
  FileSinkDesc fileSinkDesc = fsOp.getConf();
  if (chDir) {
    dest = fsOp.getConf().getFinalDirName();
    // generate the temporary file
    // it must be on the same file system as the current destination
    Context baseCtx = parseCtx.getContext();
    Path tmpDir = baseCtx.getExternalTmpPath(dest);
    // Change all the linked file sink descriptors
    if (fileSinkDesc.getLinkedFileSinkDesc() != null) {
      for (FileSinkDesc fsConf : fileSinkDesc.getLinkedFileSinkDesc()) {
        fsConf.setDirName(tmpDir);
      }
    } else {
      fileSinkDesc.setDirName(tmpDir);
    }
  }
  Task<MoveWork> mvTask = null;
  if (!chDir) {
    mvTask = GenMapRedUtils.findMoveTaskForFsopOutput(mvTasks, fileSinkDesc.getFinalDirName(), false);
  }
  // Set the move task to be dependent on the current task
  if (mvTask != null) {
    GenMapRedUtils.addDependentMoveTasks(mvTask, hconf, currTask, dependencyTask);
  }
  return dest;
}
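A minimal usage sketch, not taken from the Hive source: it assumes the surrounding plan-walker objects (currentTask, fileSink, parseContext, moveTasks, hiveConf, dependencyTask) already exist, and only illustrates the calling convention implied by the signature above.

// Hypothetical caller fragment; all referenced objects come from the plan walker.
Path finalDir = GenSparkUtils.createMoveTask(currentTask, true /* chDir: redirect the FSO to a tmp dir */,
    fileSink, parseContext, moveTasks, hiveConf, dependencyTask);
// With chDir == true the return value is the sink's original final directory, which the
// caller can feed into merge/move generation; with chDir == false it is null and the
// method only wires an existing MoveTask as a dependent of currentTask.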
use of org.apache.hadoop.hive.ql.plan.MoveWork in project hive by apache.
the class GenSparkUtils method createMoveTask.
/**
 * Create and add any dependent move tasks.
 *
 * This is forked from {@link GenMapRedUtils}. The difference is that it doesn't check
 * 'isLinkedFileSink' and does not set parent dir for the linked file sinks.
 */
public static Path createMoveTask(Task<?> currTask, boolean chDir, FileSinkOperator fsOp,
    ParseContext parseCtx, List<Task<MoveWork>> mvTasks, HiveConf hconf,
    DependencyCollectionTask dependencyTask) {
  Path dest = null;
  FileSinkDesc fileSinkDesc = fsOp.getConf();
  if (chDir) {
    dest = fsOp.getConf().getFinalDirName();
    // generate the temporary file
    // it must be on the same file system as the current destination
    Context baseCtx = parseCtx.getContext();
    Path tmpDir = null;
    // If the destination is already an MR tmp path, we shouldn't request an external
    // tmp dir, as it would end up nested inside the MR tmp dir
    if (baseCtx.isMRTmpFileURI(dest.toUri().getPath())) {
      tmpDir = baseCtx.getMRTmpPath();
    } else {
      tmpDir = baseCtx.getExternalTmpPath(dest);
    }
    // Change all the linked file sink descriptors
    if (fileSinkDesc.getLinkedFileSinkDesc() != null) {
      for (FileSinkDesc fsConf : fileSinkDesc.getLinkedFileSinkDesc()) {
        fsConf.setDirName(tmpDir);
      }
    } else {
      fileSinkDesc.setDirName(tmpDir);
    }
  }
  Task<MoveWork> mvTask = null;
  if (!chDir) {
    mvTask = GenMapRedUtils.findMoveTaskForFsopOutput(mvTasks, fileSinkDesc.getFinalDirName(), false,
        fileSinkDesc.isDirectInsert(), fileSinkDesc.getMoveTaskId(), fileSinkDesc.getAcidOperation());
  }
  // Set the move task to be dependent on the current task
  if (mvTask != null) {
    GenMapRedUtils.addDependentMoveTasks(mvTask, hconf, currTask, dependencyTask);
  }
  return dest;
}
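The only behavioral difference from the first variant is the tmp-dir choice. A hedged restatement as a standalone helper (chooseTmpDir is not part of the Hive API; it merely isolates the branch above, using the two Context methods shown in the snippet):

// Sketch only: the same decision createMoveTask makes inline.
private static Path chooseTmpDir(Context baseCtx, Path dest) {
  // A destination already under the MR scratch space must not get an external tmp
  // path, or the new tmp dir would end up nested inside the MR tmp dir.
  return baseCtx.isMRTmpFileURI(dest.toUri().getPath())
      ? baseCtx.getMRTmpPath()
      : baseCtx.getExternalTmpPath(dest);
}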
use of org.apache.hadoop.hive.ql.plan.MoveWork in project hive by apache.
the class GenMapRedUtils method findMoveTaskForFsopOutput.
public static Task<MoveWork> findMoveTaskForFsopOutput(List<Task<MoveWork>> mvTasks, Path fsopFinalDir,
    boolean isMmFsop, boolean isDirectInsert, String fsoMoveTaskId, AcidUtils.Operation acidOperation) {
  // find the move task
  for (Task<MoveWork> mvTsk : mvTasks) {
    MoveWork mvWork = mvTsk.getWork();
    Path srcDir = null;
    boolean isLfd = false;
    String moveTaskId = null;
    AcidUtils.Operation moveTaskWriteType = null;
    if (mvWork.getLoadFileWork() != null) {
      srcDir = mvWork.getLoadFileWork().getSourcePath();
      isLfd = true;
      if (isMmFsop || isDirectInsert) {
        srcDir = srcDir.getParent();
      }
      moveTaskId = mvWork.getLoadFileWork().getMoveTaskId();
      moveTaskWriteType = mvWork.getLoadFileWork().getWriteType();
    } else if (mvWork.getLoadTableWork() != null) {
      srcDir = mvWork.getLoadTableWork().getSourcePath();
      moveTaskId = mvWork.getLoadTableWork().getMoveTaskId();
      moveTaskWriteType = mvWork.getLoadTableWork().getWriteType();
    }
    if (Utilities.FILE_OP_LOGGER.isTraceEnabled()) {
      Utilities.FILE_OP_LOGGER.trace("Observing MoveWork " + System.identityHashCode(mvWork)
          + " with " + srcDir + "(from " + (isLfd ? "LFD" : "LTD") + ") while looking for "
          + fsopFinalDir + "(mm = " + isMmFsop + ")");
    }
    if ((srcDir != null) && srcDir.equals(fsopFinalDir)) {
      if (isDirectInsert || isMmFsop) {
        if (moveTaskId.equals(fsoMoveTaskId)) {
          // We need to consider the ACID write type as well to identify the MoveTask,
          // since with direct insert every FSO reports the table root as its final dir.
          if (!isDirectInsert || moveTaskWriteType.equals(acidOperation)) {
            return mvTsk;
          }
        }
      } else {
        return mvTsk;
      }
    }
  }
  return null;
}
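With direct insert, every file sink reports the table root as its final directory, so the path alone cannot pick the right MoveTask; the extra identifiers do. An illustrative call with made-up argument values, assuming moveTasks is the list collected during plan generation:

Task<MoveWork> match = GenMapRedUtils.findMoveTaskForFsopOutput(
    moveTasks,
    new Path("hdfs://nn/warehouse/t"), // table root: identical for all direct-insert FSOs
    false,                             // isMmFsop: not an insert-only (MM) table sink
    true,                              // isDirectInsert: files are written straight to the final dir
    "moveTask_42",                     // hypothetical moveTaskId recorded on the FileSinkDesc
    AcidUtils.Operation.INSERT);       // ACID write type used to disambiguate the match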
use of org.apache.hadoop.hive.ql.plan.MoveWork in project hive by apache.
the class GenMapRedUtils method createMoveTask.
/**
 * Create and add any dependent move tasks.
 *
 * @param currTask the task the move task(s) will be made dependent on
 * @param chDir whether the file sink output should be redirected to a temporary directory
 * @param fsOp the file sink operator being processed
 * @param parseCtx the parse context
 * @param mvTasks candidate move tasks already generated for the query
 * @param hconf the Hive configuration
 * @param dependencyTask optional dependency collection task to chain the move task to
 * @return the sink's original final directory if chDir is true, otherwise null
 */
public static Path createMoveTask(Task<?> currTask, boolean chDir, FileSinkOperator fsOp,
    ParseContext parseCtx, List<Task<MoveWork>> mvTasks, HiveConf hconf,
    DependencyCollectionTask dependencyTask) {
  Path dest = null;
  FileSinkDesc fileSinkDesc = fsOp.getConf();
  boolean isMmTable = fileSinkDesc.isMmTable();
  boolean isDirectInsert = fileSinkDesc.isDirectInsert();
  if (chDir) {
    dest = fileSinkDesc.getMergeInputDirName();
    /*
     * Skip temporary file generation for:
     * 1. MM Tables
     * 2. INSERT operation on full ACID table
     */
    if (!isMmTable && !isDirectInsert) {
      // generate the temporary file
      // it must be on the same file system as the current destination
      Context baseCtx = parseCtx.getContext();
      // Create the required temporary file in the HDFS location if the destination
      // path of the FileSinkOperator table is a blobstore path.
      Path tmpDir = baseCtx.getTempDirForFinalJobPath(fileSinkDesc.getDestPath());
      // Change all the linked file sink descriptors
      if (fileSinkDesc.isLinkedFileSink()) {
        for (FileSinkDesc fsConf : fileSinkDesc.getLinkedFileSinkDesc()) {
          fsConf.setDirName(new Path(tmpDir, fsConf.getDirName().getName()));
          if (Utilities.FILE_OP_LOGGER.isTraceEnabled()) {
            Utilities.FILE_OP_LOGGER.trace("createMoveTask setting tmpDir for LinkedFileSink chDir "
                + fsConf.getDirName() + "; dest was " + fileSinkDesc.getDestPath());
          }
        }
      } else {
        fileSinkDesc.setDirName(tmpDir);
        if (Utilities.FILE_OP_LOGGER.isTraceEnabled()) {
          Utilities.FILE_OP_LOGGER.trace("createMoveTask setting tmpDir chDir " + tmpDir
              + "; dest was " + fileSinkDesc.getDestPath());
        }
      }
    }
  }
  Task<MoveWork> mvTask = null;
  if (!chDir) {
    mvTask = GenMapRedUtils.findMoveTaskForFsopOutput(mvTasks, fsOp.getConf().getFinalDirName(),
        isMmTable, isDirectInsert, fsOp.getConf().getMoveTaskId(), fsOp.getConf().getAcidOperation());
  }
  // Set the move task to be dependent on the current task
  if (mvTask != null) {
    GenMapRedUtils.addDependentMoveTasks(mvTask, hconf, currTask, dependencyTask);
  }
  return dest;
}
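The linked-file-sink branch above moves each descriptor under the shared tmp dir while keeping its leaf name, so sibling sinks stay distinguishable. A self-contained sketch with made-up paths, using only org.apache.hadoop.fs.Path:

import org.apache.hadoop.fs.Path;

public class LinkedSinkRedirectDemo {
  public static void main(String[] args) {
    Path originalDir = new Path("/warehouse/t/ds=2021-01-01"); // one linked sink's dir
    Path tmpDir = new Path("/tmp/hive/_tmp.-ext-10000");       // shared temporary dir
    // Same expression as fsConf.setDirName(new Path(tmpDir, fsConf.getDirName().getName())):
    Path redirected = new Path(tmpDir, originalDir.getName());
    System.out.println(redirected); // prints /tmp/hive/_tmp.-ext-10000/ds=2021-01-01
  }
}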
use of org.apache.hadoop.hive.ql.plan.MoveWork in project hive by apache.
the class LoadSemanticAnalyzer method analyzeLoad.
private void analyzeLoad(ASTNode ast) throws SemanticException {
  fromTree = ast.getChild(0);
  tableTree = ast.getChild(1);
  boolean inputInfo = false;
  // Check the last node
  ASTNode child = (ASTNode) ast.getChild(ast.getChildCount() - 1);
  if (child.getToken().getType() == HiveParser.TOK_INPUTFORMAT) {
    if (child.getChildCount() != 2) {
      throw new SemanticException("FileFormat should contain both input format and Serde");
    }
    try {
      inputFormatClassName = stripQuotes(child.getChild(0).getText());
      serDeClassName = stripQuotes(child.getChild(1).getText());
      inputInfo = true;
    } catch (Exception e) {
      throw new SemanticException("FileFormat inputFormatClassName or serDeClassName is incorrect");
    }
  }
  if ((!inputInfo && ast.getChildCount() == 4) || (inputInfo && ast.getChildCount() == 5)) {
    isLocal = true;
    isOverWrite = true;
  }
  if ((!inputInfo && ast.getChildCount() == 3) || (inputInfo && ast.getChildCount() == 4)) {
    if (ast.getChild(2).getText().toLowerCase().equals("local")) {
      isLocal = true;
    } else {
      isOverWrite = true;
    }
  }
  // initialize load path
  URI fromURI;
  try {
    String fromPath = stripQuotes(fromTree.getText());
    fromURI = initializeFromURI(fromPath, isLocal);
  } catch (IOException | URISyntaxException e) {
    throw new SemanticException(ASTErrorUtils.getMsg(ErrorMsg.INVALID_PATH.getMsg(), fromTree,
        e.getMessage()), e);
  }
  // initialize destination table/partition
  TableSpec ts = new TableSpec(db, conf, (ASTNode) tableTree);
  if (ts.tableHandle.isView() || ts.tableHandle.isMaterializedView()) {
    throw new SemanticException(ErrorMsg.DML_AGAINST_VIEW.getMsg());
  }
  if (ts.tableHandle.isNonNative()) {
    throw new SemanticException(ErrorMsg.LOAD_INTO_NON_NATIVE.getMsg());
  }
  if (ts.tableHandle.isStoredAsSubDirectories()) {
    throw new SemanticException(ErrorMsg.LOAD_INTO_STORED_AS_DIR.getMsg());
  }
  List<FieldSchema> parts = ts.tableHandle.getPartitionKeys();
  if ((parts != null && parts.size() > 0) && (ts.partSpec == null || ts.partSpec.size() == 0)) {
    // launch a tez job
    reparseAndSuperAnalyze(ts.tableHandle, fromURI);
    return;
  }
  List<String> bucketCols = ts.tableHandle.getBucketCols();
  if (bucketCols != null && !bucketCols.isEmpty()) {
    String error = StrictChecks.checkBucketing(conf);
    if (error != null) {
      // launch a tez job
      reparseAndSuperAnalyze(ts.tableHandle, fromURI);
      return;
    }
  }
  // make sure the arguments make sense
  List<FileStatus> files = applyConstraintsAndGetFiles(fromURI, ts.tableHandle);
  if (queryReWritten) {
    return;
  }
  // for managed tables, make sure the file formats match
  if (TableType.MANAGED_TABLE.equals(ts.tableHandle.getTableType())
      && conf.getBoolVar(HiveConf.ConfVars.HIVECHECKFILEFORMAT)) {
    ensureFileFormatsMatch(ts, files, fromURI);
  }
  inputs.add(toReadEntity(new Path(fromURI)));
  // create final load/move work
  boolean preservePartitionSpecs = false;
  Map<String, String> partSpec = ts.getPartSpec();
  if (partSpec == null) {
    partSpec = new LinkedHashMap<String, String>();
    outputs.add(new WriteEntity(ts.tableHandle,
        (isOverWrite ? WriteEntity.WriteType.INSERT_OVERWRITE : WriteEntity.WriteType.INSERT)));
  } else {
    try {
      Partition part = Hive.get().getPartition(ts.tableHandle, partSpec, false);
      if (part != null) {
        if (isOverWrite) {
          outputs.add(new WriteEntity(part, WriteEntity.WriteType.INSERT_OVERWRITE));
        } else {
          outputs.add(new WriteEntity(part, WriteEntity.WriteType.INSERT));
          // If partition already exists and we aren't overwriting it, then respect
          // its current location info rather than picking it from the parent TableDesc
          preservePartitionSpecs = true;
        }
      } else {
        outputs.add(new WriteEntity(ts.tableHandle,
            (isOverWrite ? WriteEntity.WriteType.INSERT_OVERWRITE : WriteEntity.WriteType.INSERT)));
      }
    } catch (HiveException e) {
      throw new SemanticException(e);
    }
  }
  Long writeId = null;
  int stmtId = -1;
  boolean isTxnTable = AcidUtils.isTransactionalTable(ts.tableHandle);
  if (isTxnTable) {
    try {
      writeId = getTxnMgr().getTableWriteId(ts.tableHandle.getDbName(), ts.tableHandle.getTableName());
    } catch (LockException ex) {
      throw new SemanticException("Failed to allocate the write id", ex);
    }
    stmtId = getTxnMgr().getStmtIdAndIncrement();
  }
  // Note: this sets LoadFileType incorrectly for ACID; is that relevant for load?
  // See setLoadFileType and setIsAcidIow calls elsewhere for an example.
  LoadTableDesc loadTableWork = new LoadTableDesc(new Path(fromURI),
      Utilities.getTableDesc(ts.tableHandle), partSpec,
      isOverWrite ? LoadFileType.REPLACE_ALL : LoadFileType.KEEP_EXISTING, writeId);
  loadTableWork.setStmtId(stmtId);
  loadTableWork.setInsertOverwrite(isOverWrite);
  if (preservePartitionSpecs) {
    // Note: preservePartitionSpecs=true implies inheritTableSpecs=false, but
    // preservePartitionSpecs=false (the default) is not sufficient information
    // to set inheritTableSpecs=true
    loadTableWork.setInheritTableSpecs(false);
  }
  Task<?> childTask = TaskFactory.get(new MoveWork(getInputs(), getOutputs(), loadTableWork, null,
      true, isLocal));
  rootTasks.add(childTask);
  // The user asked for stats to be collected.
  // Some stats, like number of rows, require a scan of the data.
  // However, some other stats, like number of files, do not require a complete scan.
  // Update the stats which do not require a complete scan.
  Task<?> statTask = null;
  if (conf.getBoolVar(HiveConf.ConfVars.HIVESTATSAUTOGATHER)) {
    BasicStatsWork basicStatsWork = new BasicStatsWork(loadTableWork);
    basicStatsWork.setNoStatsAggregator(true);
    basicStatsWork.setClearAggregatorStats(true);
    StatsWork columnStatsWork = new StatsWork(ts.tableHandle, basicStatsWork, conf);
    statTask = TaskFactory.get(columnStatsWork);
  }
  if (statTask != null) {
    childTask.addDependentTask(statTask);
  }
}
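For orientation, the child-count checks at the top of analyzeLoad correspond to the optional LOCAL and OVERWRITE keywords. The statements below are made-up examples, not from the source, and the task shape is a sketch of what the code above builds:

// LOAD DATA INPATH '/staging/f' INTO TABLE t;             -- 2 children: neither flag set
// LOAD DATA LOCAL INPATH '/tmp/f' INTO TABLE t;           -- 3 children, third is "local"
// LOAD DATA INPATH '/staging/f' OVERWRITE INTO TABLE t;   -- 3 children, third not "local"
// LOAD DATA LOCAL INPATH '/tmp/f' OVERWRITE INTO TABLE t; -- 4 children: both flags set
// (a trailing TOK_INPUTFORMAT node shifts each count up by one)
//
// Resulting root-task shape for the simple non-partitioned case:
//   MoveTask(MoveWork{LoadTableDesc})
//     └── StatsTask(BasicStatsWork)   // only when hive.stats.autogather=true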