use of org.apache.hadoop.hive.ql.Context in project hive by apache.
the class TaskCompiler method compile.
@SuppressWarnings({ "nls", "unchecked" })
public void compile(final ParseContext pCtx, final List<Task<? extends Serializable>> rootTasks, final HashSet<ReadEntity> inputs, final HashSet<WriteEntity> outputs) throws SemanticException {
Context ctx = pCtx.getContext();
GlobalLimitCtx globalLimitCtx = pCtx.getGlobalLimitCtx();
List<Task<MoveWork>> mvTask = new ArrayList<Task<MoveWork>>();
List<LoadTableDesc> loadTableWork = pCtx.getLoadTableWork();
List<LoadFileDesc> loadFileWork = pCtx.getLoadFileWork();
boolean isCStats = pCtx.getQueryProperties().isAnalyzeRewrite();
int outerQueryLimit = pCtx.getQueryProperties().getOuterQueryLimit();
if (pCtx.getFetchTask() != null) {
if (pCtx.getFetchTask().getTblDesc() == null) {
return;
}
pCtx.getFetchTask().getWork().setHiveServerQuery(SessionState.get().isHiveServerQuery());
TableDesc resultTab = pCtx.getFetchTask().getTblDesc();
// If the result serde is not ThriftJDBCBinarySerDe, then either the ThriftFormatter or the DefaultFetchFormatter should be used.
if (!resultTab.getSerdeClassName().equalsIgnoreCase(ThriftJDBCBinarySerDe.class.getName())) {
if (SessionState.get().isHiveServerQuery()) {
conf.set(SerDeUtils.LIST_SINK_OUTPUT_FORMATTER, ThriftFormatter.class.getName());
} else {
String formatterName = conf.get(SerDeUtils.LIST_SINK_OUTPUT_FORMATTER);
if (formatterName == null || formatterName.isEmpty()) {
conf.set(SerDeUtils.LIST_SINK_OUTPUT_FORMATTER, DefaultFetchFormatter.class.getName());
}
}
}
return;
}
optimizeOperatorPlan(pCtx, inputs, outputs);
/*
* In case of a select, use a fetch task instead of a move task.
* If the select is from analyze table column rewrite, don't create a fetch task. Instead create
* a column stats task later.
*/
if (pCtx.getQueryProperties().isQuery() && !isCStats) {
if ((!loadTableWork.isEmpty()) || (loadFileWork.size() != 1)) {
throw new SemanticException(ErrorMsg.INVALID_LOAD_TABLE_FILE_WORK.getMsg());
}
LoadFileDesc loadFileDesc = loadFileWork.get(0);
String cols = loadFileDesc.getColumns();
String colTypes = loadFileDesc.getColumnTypes();
String resFileFormat;
TableDesc resultTab = pCtx.getFetchTableDesc();
if (resultTab == null) {
resFileFormat = HiveConf.getVar(conf, HiveConf.ConfVars.HIVEQUERYRESULTFILEFORMAT);
if (SessionState.get().getIsUsingThriftJDBCBinarySerDe() && (resFileFormat.equalsIgnoreCase("SequenceFile"))) {
resultTab = PlanUtils.getDefaultQueryOutputTableDesc(cols, colTypes, resFileFormat, ThriftJDBCBinarySerDe.class);
// Set the fetch formatter to be a no-op for the ListSinkOperator, since we'll
// read formatted thrift objects from the output SequenceFile written by Tasks.
conf.set(SerDeUtils.LIST_SINK_OUTPUT_FORMATTER, NoOpFetchFormatter.class.getName());
} else {
resultTab = PlanUtils.getDefaultQueryOutputTableDesc(cols, colTypes, resFileFormat, LazySimpleSerDe.class);
}
} else {
if (resultTab.getProperties().getProperty(serdeConstants.SERIALIZATION_LIB).equalsIgnoreCase(ThriftJDBCBinarySerDe.class.getName())) {
// Set the fetch formatter to be a no-op for the ListSinkOperator, since we'll
// read formatted thrift objects from the output SequenceFile written by Tasks.
conf.set(SerDeUtils.LIST_SINK_OUTPUT_FORMATTER, NoOpFetchFormatter.class.getName());
}
}
FetchWork fetch = new FetchWork(loadFileDesc.getSourcePath(), resultTab, outerQueryLimit);
boolean isHiveServerQuery = SessionState.get().isHiveServerQuery();
fetch.setHiveServerQuery(isHiveServerQuery);
fetch.setSource(pCtx.getFetchSource());
fetch.setSink(pCtx.getFetchSink());
if (isHiveServerQuery && null != resultTab && resultTab.getSerdeClassName().equalsIgnoreCase(ThriftJDBCBinarySerDe.class.getName()) && HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_SERVER2_THRIFT_RESULTSET_SERIALIZE_IN_TASKS)) {
fetch.setIsUsingThriftJDBCBinarySerDe(true);
} else {
fetch.setIsUsingThriftJDBCBinarySerDe(false);
}
pCtx.setFetchTask((FetchTask) TaskFactory.get(fetch, conf));
// For the FetchTask, the limit optimization requires we fetch all the rows
// in memory and count how many rows we get. It's not practical if the
// limit factor is too big
int fetchLimit = HiveConf.getIntVar(conf, HiveConf.ConfVars.HIVELIMITOPTMAXFETCH);
if (globalLimitCtx.isEnable() && globalLimitCtx.getGlobalLimit() > fetchLimit) {
LOG.info("For FetchTask, LIMIT " + globalLimitCtx.getGlobalLimit() + " > " + fetchLimit + ". Doesn't qualify limit optimization.");
globalLimitCtx.disableOpt();
}
if (outerQueryLimit == 0) {
// Believe it or not, some tools do generate queries with limit 0 and then expect the
// query to run quickly. Let's meet their requirement.
LOG.info("Limit 0. No query execution needed.");
return;
}
} else if (!isCStats) {
for (LoadTableDesc ltd : loadTableWork) {
Task<MoveWork> tsk = TaskFactory.get(new MoveWork(null, null, ltd, null, false), conf);
mvTask.add(tsk);
// Check to see if this load makes any indexes stale, and auto-update them if configured to do so
if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVEINDEXAUTOUPDATE)) {
IndexUpdater indexUpdater = new IndexUpdater(loadTableWork, inputs, conf);
try {
List<Task<? extends Serializable>> indexUpdateTasks = indexUpdater.generateUpdateTasks();
for (Task<? extends Serializable> updateTask : indexUpdateTasks) {
tsk.addDependentTask(updateTask);
}
} catch (HiveException e) {
console.printInfo("WARNING: could not auto-update stale indexes, which are not in sync");
}
}
}
boolean oneLoadFile = true;
for (LoadFileDesc lfd : loadFileWork) {
if (pCtx.getQueryProperties().isCTAS() || pCtx.getQueryProperties().isMaterializedView()) {
// should not have more than 1 load file for CTAS
assert (oneLoadFile);
// make the movetask's destination directory the table's destination.
Path location;
String loc = pCtx.getQueryProperties().isCTAS() ? pCtx.getCreateTable().getLocation() : pCtx.getCreateViewDesc().getLocation();
if (loc == null) {
// get the default location
Path targetPath;
try {
String protoName = null;
if (pCtx.getQueryProperties().isCTAS()) {
protoName = pCtx.getCreateTable().getTableName();
} else if (pCtx.getQueryProperties().isMaterializedView()) {
protoName = pCtx.getCreateViewDesc().getViewName();
}
String[] names = Utilities.getDbTableName(protoName);
if (!db.databaseExists(names[0])) {
throw new SemanticException("ERROR: The database " + names[0] + " does not exist.");
}
Warehouse wh = new Warehouse(conf);
targetPath = wh.getTablePath(db.getDatabase(names[0]), names[1]);
} catch (HiveException e) {
throw new SemanticException(e);
} catch (MetaException e) {
throw new SemanticException(e);
}
location = targetPath;
} else {
location = new Path(loc);
}
lfd.setTargetDir(location);
oneLoadFile = false;
}
mvTask.add(TaskFactory.get(new MoveWork(null, null, null, lfd, false), conf));
}
}
generateTaskTree(rootTasks, pCtx, mvTask, inputs, outputs);
// For each task, set the key descriptor for the reducer
for (Task<? extends Serializable> rootTask : rootTasks) {
GenMapRedUtils.setKeyAndValueDescForTaskTree(rootTask);
}
// If a task contains an operator which instructs bucketizedhiveinputformat to be used, please do so
for (Task<? extends Serializable> rootTask : rootTasks) {
setInputFormat(rootTask);
}
optimizeTaskPlan(rootTasks, pCtx, ctx);
/*
* If the query was the result of analyze table column compute statistics rewrite, create
* a column stats task instead of a fetch task to persist stats to the metastore.
*/
if (isCStats || !pCtx.getColumnStatsAutoGatherContexts().isEmpty()) {
Set<Task<? extends Serializable>> leafTasks = new LinkedHashSet<Task<? extends Serializable>>();
getLeafTasks(rootTasks, leafTasks);
if (isCStats) {
genColumnStatsTask(pCtx.getAnalyzeRewrite(), loadFileWork, leafTasks, outerQueryLimit, 0);
} else {
for (ColumnStatsAutoGatherContext columnStatsAutoGatherContext : pCtx.getColumnStatsAutoGatherContexts()) {
if (!columnStatsAutoGatherContext.isInsertInto()) {
genColumnStatsTask(columnStatsAutoGatherContext.getAnalyzeRewrite(), columnStatsAutoGatherContext.getLoadFileWork(), leafTasks, outerQueryLimit, 0);
} else {
int numBitVector;
try {
numBitVector = HiveStatsUtils.getNumBitVectorsForNDVEstimation(conf);
} catch (Exception e) {
throw new SemanticException(e.getMessage());
}
genColumnStatsTask(columnStatsAutoGatherContext.getAnalyzeRewrite(), columnStatsAutoGatherContext.getLoadFileWork(), leafTasks, outerQueryLimit, numBitVector);
}
}
}
}
decideExecMode(rootTasks, ctx, globalLimitCtx);
if (pCtx.getQueryProperties().isCTAS() && !pCtx.getCreateTable().isMaterialization()) {
// generate a DDL task and make it a dependent task of the leaf
CreateTableDesc crtTblDesc = pCtx.getCreateTable();
crtTblDesc.validate(conf);
Task<? extends Serializable> crtTblTask = TaskFactory.get(new DDLWork(inputs, outputs, crtTblDesc), conf);
patchUpAfterCTASorMaterializedView(rootTasks, outputs, crtTblTask);
} else if (pCtx.getQueryProperties().isMaterializedView()) {
// generate a DDL task and make it a dependent task of the leaf
CreateViewDesc viewDesc = pCtx.getCreateViewDesc();
Task<? extends Serializable> crtViewTask = TaskFactory.get(new DDLWork(inputs, outputs, viewDesc), conf);
patchUpAfterCTASorMaterializedView(rootTasks, outputs, crtViewTask);
}
if (globalLimitCtx.isEnable() && pCtx.getFetchTask() != null) {
LOG.info("set least row check for FetchTask: " + globalLimitCtx.getGlobalLimit());
pCtx.getFetchTask().getWork().setLeastNumRows(globalLimitCtx.getGlobalLimit());
}
if (globalLimitCtx.isEnable() && globalLimitCtx.getLastReduceLimitDesc() != null) {
LOG.info("set least row check for LimitDesc: " + globalLimitCtx.getGlobalLimit());
globalLimitCtx.getLastReduceLimitDesc().setLeastRows(globalLimitCtx.getGlobalLimit());
List<ExecDriver> mrTasks = Utilities.getMRTasks(rootTasks);
for (ExecDriver tsk : mrTasks) {
tsk.setRetryCmdWhenFail(true);
}
List<SparkTask> sparkTasks = Utilities.getSparkTasks(rootTasks);
for (SparkTask sparkTask : sparkTasks) {
sparkTask.setRetryCmdWhenFail(true);
}
}
Interner<TableDesc> interner = Interners.newStrongInterner();
for (Task<? extends Serializable> rootTask : rootTasks) {
GenMapRedUtils.internTableDesc(rootTask, interner);
GenMapRedUtils.deriveFinalExplainAttributes(rootTask, pCtx.getConf());
}
}
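For reference, the formatter handling in the two branches above reduces to one small policy: results serialized with ThriftJDBCBinarySerDe need only a no-op formatter, HiveServer2 queries use the ThriftFormatter, and everything else falls back to the DefaultFetchFormatter unless a formatter is already configured. Below is a minimal, standalone sketch of that decision; the class name and the returned strings are illustrative stand-ins, not Hive's API.

// Condensed restatement of the fetch-formatter choice made in compile() above.
// FetchFormatterChooser and the returned strings are illustrative, not Hive classes.
final class FetchFormatterChooser {
  static String choose(boolean resultUsesThriftJdbcSerde, boolean isHiveServerQuery, String configuredFormatter) {
    if (resultUsesThriftJdbcSerde) {
      // tasks already wrote thrift-serialized rows, so the ListSinkOperator should not re-format them
      return "NoOpFetchFormatter";
    }
    if (isHiveServerQuery) {
      // HiveServer2 clients consume thrift objects directly
      return "ThriftFormatter";
    }
    // plain CLI fetch: keep an explicitly configured formatter, otherwise use the default
    return (configuredFormatter == null || configuredFormatter.isEmpty()) ? "DefaultFetchFormatter" : configuredFormatter;
  }
}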
use of org.apache.hadoop.hive.ql.Context in project hive by apache.
the class UpdateDeleteSemanticAnalyzer method analyzeMerge.
/**
* Here we take a Merge statement AST and generate a semantically equivalent multi-insert
* statement to execute. Each Insert leg represents a single WHEN clause. As much as possible,
* the new SQL statement is made to look like the input SQL statement, so that it is easier to map
* Query Compiler errors from the generated SQL back to the original statement.
* The generated SQL is a complete representation of the original input for the same reason.
* In many places SemanticAnalyzer throws exceptions that contain (line, position) coordinates.
* If the generated SQL didn't have everything and were patched up later, those coordinates would
* point to the wrong place.
*
* @throws SemanticException
*/
private void analyzeMerge(ASTNode tree) throws SemanticException {
currentOperation = Context.Operation.MERGE;
quotedIdenfierHelper = new IdentifierQuoter(ctx.getTokenRewriteStream());
/*
* See org.apache.hadoop.hive.ql.parse.TestMergeStatement for some examples of the merge AST
For example, given:
merge into acidTbl using nonAcidPart2 source ON acidTbl.a = source.a2
WHEN MATCHED THEN UPDATE set b = source.b2
WHEN NOT MATCHED THEN INSERT VALUES(source.a2, source.b2)
We get AST like this:
"(tok_merge " +
"(tok_tabname acidtbl) (tok_tabref (tok_tabname nonacidpart2) source) " +
"(= (. (tok_table_or_col acidtbl) a) (. (tok_table_or_col source) a2)) " +
"(tok_matched " +
"(tok_update " +
"(tok_set_columns_clause (= (tok_table_or_col b) (. (tok_table_or_col source) b2))))) " +
"(tok_not_matched " +
"tok_insert " +
"(tok_value_row (. (tok_table_or_col source) a2) (. (tok_table_or_col source) b2))))");
And we need to produce a multi-insert like this to execute:
FROM acidTbl right outer join nonAcidPart2 ON acidTbl.a = source.a2
Insert into table acidTbl select nonAcidPart2.a2, nonAcidPart2.b2 where acidTbl.a is null
INSERT INTO TABLE acidTbl select target.ROW__ID, nonAcidPart2.a2, nonAcidPart2.b2 where nonAcidPart2.a2=acidTbl.a sort by acidTbl.ROW__ID
*/
/*todo: we need some sort of validation phase over original AST to make things user friendly; for example, if
original command refers to a column that doesn't exist, this will be caught when processing the rewritten query but
the errors will point at locations that the user can't map to anything
- VALUES clause must have the same number of values as target table (including partition cols). Part cols go last in Select clause of Insert as Select
todo: do we care to preserve comments in original SQL?
todo: check if identifiers are properly escaped/quoted in the generated SQL - it's currently inconsistent
Look at UnparseTranslator.addIdentifierTranslation() - it does unescape + unparse...
todo: consider "WHEN NOT MATCHED BY SOURCE THEN UPDATE SET TargetTable.Col1 = SourceTable.Col1"; what happens when source is empty? This should be a runtime error - or maybe not:
the outer side of the ROJ is empty => the join produces 0 rows. If supporting WHEN NOT MATCHED BY SOURCE, then this should be a runtime error
*/
ASTNode target = (ASTNode) tree.getChild(0);
ASTNode source = (ASTNode) tree.getChild(1);
String targetName = getSimpleTableName(target);
String sourceName = getSimpleTableName(source);
ASTNode onClause = (ASTNode) tree.getChild(2);
String onClauseAsText = getMatchedText(onClause);
Table targetTable = getTargetTable(target);
validateTargetTable(targetTable);
List<ASTNode> whenClauses = findWhenClauses(tree);
StringBuilder rewrittenQueryStr = new StringBuilder("FROM\n");
rewrittenQueryStr.append(Indent).append(getFullTableNameForSQL(target));
if (isAliased(target)) {
rewrittenQueryStr.append(" ").append(targetName);
}
rewrittenQueryStr.append('\n');
rewrittenQueryStr.append(Indent).append(chooseJoinType(whenClauses)).append("\n");
if (source.getType() == HiveParser.TOK_SUBQUERY) {
//this includes the mandatory alias
rewrittenQueryStr.append(Indent).append(getMatchedText(source));
} else {
rewrittenQueryStr.append(Indent).append(getFullTableNameForSQL(source));
if (isAliased(source)) {
rewrittenQueryStr.append(" ").append(sourceName);
}
}
rewrittenQueryStr.append('\n');
rewrittenQueryStr.append(Indent).append("ON ").append(onClauseAsText).append('\n');
/**
* We allow at most 2 WHEN MATCHED clauses, in which case one must be an Update and the other a Delete.
* If we have both an update and a delete, the 1st one (in SQL order) must have an "AND <extra predicate>"
* so that the 2nd one is guaranteed not to process the same rows.
* Update and Delete may be in any order. (Insert is always last.)
*/
String extraPredicate = null;
int numWhenMatchedUpdateClauses = 0, numWhenMatchedDeleteClauses = 0;
for (ASTNode whenClause : whenClauses) {
switch(getWhenClauseOperation(whenClause).getType()) {
case HiveParser.TOK_INSERT:
handleInsert(whenClause, rewrittenQueryStr, target, onClause, targetTable, targetName, onClauseAsText);
break;
case HiveParser.TOK_UPDATE:
numWhenMatchedUpdateClauses++;
String s = handleUpdate(whenClause, rewrittenQueryStr, target, onClauseAsText, targetTable, extraPredicate);
if (numWhenMatchedUpdateClauses + numWhenMatchedDeleteClauses == 1) {
//i.e. it's the 1st WHEN MATCHED
extraPredicate = s;
}
break;
case HiveParser.TOK_DELETE:
numWhenMatchedDeleteClauses++;
String s1 = handleDelete(whenClause, rewrittenQueryStr, target, onClauseAsText, targetTable, extraPredicate);
if (numWhenMatchedUpdateClauses + numWhenMatchedDeleteClauses == 1) {
//i.e. it's the 1st WHEN MATCHED
extraPredicate = s1;
}
break;
default:
throw new IllegalStateException("Unexpected WHEN clause type: " + whenClause.getType() + addParseInfo(whenClause));
}
if (numWhenMatchedDeleteClauses > 1) {
throw new SemanticException(ErrorMsg.MERGE_TOO_MANY_DELETE, ctx.getCmd());
}
if (numWhenMatchedUpdateClauses > 1) {
throw new SemanticException(ErrorMsg.MERGE_TOO_MANY_UPDATE, ctx.getCmd());
}
}
if (numWhenMatchedDeleteClauses + numWhenMatchedUpdateClauses == 2 && extraPredicate == null) {
throw new SemanticException(ErrorMsg.MERGE_PREDIACTE_REQUIRED, ctx.getCmd());
}
boolean validating = handleCardinalityViolation(rewrittenQueryStr, target, onClauseAsText, targetTable, numWhenMatchedDeleteClauses == 0 && numWhenMatchedUpdateClauses == 0);
ReparseResult rr = parseRewrittenQuery(rewrittenQueryStr, ctx.getCmd());
Context rewrittenCtx = rr.rewrittenCtx;
ASTNode rewrittenTree = rr.rewrittenTree;
rewrittenCtx.setOperation(Context.Operation.MERGE);
//set dest name mapping on the new context; the 1st child is TOK_FROM
for (int insClauseIdx = 1, whenClauseIdx = 0; insClauseIdx < rewrittenTree.getChildCount() - (validating ? 1 : 0); insClauseIdx++, whenClauseIdx++) {
//we've added Insert clauses in the order of the WHEN items in whenClauses
ASTNode insertClause = (ASTNode) rewrittenTree.getChild(insClauseIdx);
switch(getWhenClauseOperation(whenClauses.get(whenClauseIdx)).getType()) {
case HiveParser.TOK_INSERT:
rewrittenCtx.addDestNamePrefix(insClauseIdx, Context.DestClausePrefix.INSERT);
break;
case HiveParser.TOK_UPDATE:
rewrittenCtx.addDestNamePrefix(insClauseIdx, Context.DestClausePrefix.UPDATE);
break;
case HiveParser.TOK_DELETE:
rewrittenCtx.addDestNamePrefix(insClauseIdx, Context.DestClausePrefix.DELETE);
break;
default:
assert false;
}
}
if (validating) {
//if we are validating, the last branch of the multi-insert is the cardinality-violation check
rewrittenCtx.addDestNamePrefix(rewrittenTree.getChildCount() - 1, Context.DestClausePrefix.INSERT);
}
try {
useSuper = true;
super.analyze(rewrittenTree, rewrittenCtx);
} finally {
useSuper = false;
}
updateOutputs(targetTable);
}
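To make the rewrite shape easier to see, here is a heavily simplified sketch of the string the method assembles: a single FROM/JOIN/ON prefix followed by one Insert leg per WHEN clause, plus an optional extra leg for the cardinality check. All names below (MergeRewriteSketch, buildMultiInsert) are hypothetical; the real code works on the AST, chooses the join type from the WHEN clauses, and handles quoting, extra predicates and ROW__ID columns.

// Hypothetical simplification of the multi-insert text built by analyzeMerge() above.
final class MergeRewriteSketch {
  // each entry of insertLegs is the already-rewritten "INSERT INTO ... SELECT ... WHERE ..." text for one WHEN clause
  static String buildMultiInsert(String target, String source, String onClause,
      java.util.List<String> insertLegs, boolean addCardinalityCheckLeg) {
    StringBuilder sql = new StringBuilder("FROM\n");
    sql.append("  ").append(target).append('\n');
    // the real code picks the join type (e.g. RIGHT OUTER JOIN) based on the WHEN clauses present
    sql.append("  RIGHT OUTER JOIN\n");
    sql.append("  ").append(source).append('\n');
    sql.append("  ON ").append(onClause).append('\n');
    for (String leg : insertLegs) {
      // one Insert leg per WHEN clause, in the order the WHEN clauses appear
      sql.append(leg).append('\n');
    }
    if (addCardinalityCheckLeg) {
      // the real code appends an extra insert that fails at runtime if one target row
      // matched more than one source row (handleCardinalityViolation above)
      sql.append("<cardinality-violation check leg>\n");
    }
    return sql.toString();
  }
}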
use of org.apache.hadoop.hive.ql.Context in project hive by apache.
the class MoveTask method releaseLocks.
// Release all the locks acquired for this object
// This becomes important for multi-table inserts when one branch may take much more
// time than the others. It is better to release the lock for this particular insert.
// The other option is to wait for all the branches to finish, or set
// hive.multi.insert.move.tasks.share.dependencies to true, which will mean that the
// first multi-insert results will be available when all of the branches of multi-table
// inserts are done.
private void releaseLocks(LoadTableDesc ltd) throws HiveException {
// nothing needs to be done
if (!conf.getBoolVar(HiveConf.ConfVars.HIVE_SUPPORT_CONCURRENCY)) {
return;
}
Context ctx = driverContext.getCtx();
if (ctx.getHiveTxnManager().supportsAcid()) {
//Acid LM doesn't maintain getOutputLockObjects(); this 'if' just makes it more explicit
return;
}
HiveLockManager lockMgr = ctx.getHiveTxnManager().getLockManager();
WriteEntity output = ctx.getLoadTableOutputMap().get(ltd);
List<HiveLockObj> lockObjects = ctx.getOutputLockObjects().get(output);
if (lockObjects == null) {
return;
}
for (HiveLockObj lockObj : lockObjects) {
List<HiveLock> locks = lockMgr.getLocks(lockObj.getObj(), false, true);
for (HiveLock lock : locks) {
if (lock.getHiveLockMode() == lockObj.getMode()) {
if (ctx.getHiveLocks().remove(lock)) {
LOG.info("about to release lock for output: " + output.toString() + " lock: " + lock.getHiveLockObject().getName());
try {
lockMgr.unlock(lock);
} catch (LockException le) {
// should be OK since the lock is ephemeral and will eventually be deleted
// when the query finishes and zookeeper session is closed.
LOG.warn("Could not release lock " + lock.getHiveLockObject().getName());
}
}
}
}
}
}
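The pattern here is simply "release only the locks recorded for this output, and tolerate unlock failures". A generic sketch of that pattern follows, with a hypothetical lock-manager interface standing in for Hive's HiveLockManager and string keys standing in for the entity and lock objects.

// Hypothetical simplification of the per-output release loop in releaseLocks() above.
interface UnlockOnly {
  void unlock(String lockName) throws Exception;
}

final class PerOutputLockRelease {
  static void release(java.util.Map<String, java.util.List<String>> locksByOutput, String output, UnlockOnly lockMgr) {
    java.util.List<String> locks = locksByOutput.get(output);
    if (locks == null) {
      return; // nothing was acquired for this branch of the multi-insert
    }
    for (String lockName : locks) {
      try {
        lockMgr.unlock(lockName);
      } catch (Exception e) {
        // mirror the code above: warn and continue, the ephemeral lock will eventually go away
      }
    }
  }
}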
use of org.apache.hadoop.hive.ql.Context in project hive by apache.
the class GenMapRedUtils method createMoveTask.
/**
* Create and add any dependent move tasks.
*
* @param currTask the task whose output is being moved
* @param chDir whether the FileSinkOperator should first write to a temporary directory
* @param fsOp the FileSinkOperator whose output is being moved
* @param parseCtx parse context of the query
* @param mvTasks move tasks already generated for the query
* @param hconf Hive configuration
* @param dependencyTask optional dependency-collection task the move task should depend on
* @return the FileSinkOperator's original final directory when chDir is true, otherwise null
*/
public static Path createMoveTask(Task<? extends Serializable> currTask, boolean chDir, FileSinkOperator fsOp, ParseContext parseCtx, List<Task<MoveWork>> mvTasks, HiveConf hconf, DependencyCollectionTask dependencyTask) {
Path dest = null;
if (chDir) {
FileSinkDesc fileSinkDesc = fsOp.getConf();
dest = fileSinkDesc.getFinalDirName();
// generate the temporary file
// it must be on the same file system as the current destination
Context baseCtx = parseCtx.getContext();
// Create the required temporary file in the HDFS location if the destination
// path of the FileSinkOperator table is a blobstore path.
Path tmpDir = baseCtx.getTempDirForPath(fileSinkDesc.getDestPath(), true);
// Change all the linked file sink descriptors
if (fileSinkDesc.isLinkedFileSink()) {
for (FileSinkDesc fsConf : fileSinkDesc.getLinkedFileSinkDesc()) {
fsConf.setParentDir(tmpDir);
fsConf.setDirName(new Path(tmpDir, fsConf.getDirName().getName()));
}
} else {
fileSinkDesc.setDirName(tmpDir);
}
}
Task<MoveWork> mvTask = null;
if (!chDir) {
mvTask = GenMapRedUtils.findMoveTask(mvTasks, fsOp);
}
// Set the move task to be dependent on the current task
if (mvTask != null) {
GenMapRedUtils.addDependentMoveTasks(mvTask, hconf, currTask, dependencyTask);
}
return dest;
}
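The chDir branch is the interesting part: the sink is pointed at a temporary directory and its original destination is handed back so the caller can wire up a move from the temporary directory to the final location. A tiny sketch of that redirection, with a hypothetical Sink class in place of FileSinkDesc:

// Hypothetical stand-in for FileSinkDesc, illustrating the chDir redirection above.
final class Sink {
  java.net.URI finalDir; // where the results must ultimately live
  java.net.URI writeDir; // where the task actually writes

  // Point the sink at a temp dir and return the original destination,
  // which the caller later uses as the target of a move task (tmp -> finalDir).
  java.net.URI redirectToTemp(java.net.URI tmpDir) {
    java.net.URI dest = finalDir;
    writeDir = tmpDir;
    return dest;
  }
}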
use of org.apache.hadoop.hive.ql.Context in project hive by apache.
the class SparkSkewJoinProcFactory method splitTask.
/**
* If the join is not in a leaf ReduceWork, the spark task has to be split into 2 tasks.
*/
private static void splitTask(SparkTask currentTask, ReduceWork reduceWork, ParseContext parseContext) throws SemanticException {
SparkWork currentWork = currentTask.getWork();
Set<Operator<?>> reduceSinkSet = SparkMapJoinResolver.getOp(reduceWork, ReduceSinkOperator.class);
if (currentWork.getChildren(reduceWork).size() == 1 && canSplit(currentWork) && reduceSinkSet.size() == 1) {
ReduceSinkOperator reduceSink = (ReduceSinkOperator) reduceSinkSet.iterator().next();
BaseWork childWork = currentWork.getChildren(reduceWork).get(0);
SparkEdgeProperty originEdge = currentWork.getEdgeProperty(reduceWork, childWork);
// disconnect the reduce work from its child; this should produce two isolated sub-graphs
currentWork.disconnect(reduceWork, childWork);
// move works following the current reduce work into a new spark work
SparkWork newWork = new SparkWork(parseContext.getConf().getVar(HiveConf.ConfVars.HIVEQUERYID));
newWork.add(childWork);
copyWorkGraph(currentWork, newWork, childWork);
// remove them from current spark work
for (BaseWork baseWork : newWork.getAllWorkUnsorted()) {
currentWork.remove(baseWork);
currentWork.getCloneToWork().remove(baseWork);
}
// create TS to read intermediate data
Context baseCtx = parseContext.getContext();
Path taskTmpDir = baseCtx.getMRTmpPath();
Operator<? extends OperatorDesc> rsParent = reduceSink.getParentOperators().get(0);
TableDesc tableDesc = PlanUtils.getIntermediateFileTableDesc(PlanUtils.getFieldSchemasFromRowSchema(rsParent.getSchema(), "temporarycol"));
// this will insert FS and TS between the RS and its parent
TableScanOperator tableScanOp = GenMapRedUtils.createTemporaryFile(rsParent, reduceSink, taskTmpDir, tableDesc, parseContext);
// create new MapWork
MapWork mapWork = PlanUtils.getMapRedWork().getMapWork();
mapWork.setName("Map " + GenSparkUtils.getUtils().getNextSeqNumber());
newWork.add(mapWork);
newWork.connect(mapWork, childWork, originEdge);
// setup the new map work
String streamDesc = taskTmpDir.toUri().toString();
if (GenMapRedUtils.needsTagging((ReduceWork) childWork)) {
Operator<? extends OperatorDesc> childReducer = ((ReduceWork) childWork).getReducer();
String id = null;
if (childReducer instanceof JoinOperator) {
if (parseContext.getJoinOps().contains(childReducer)) {
id = ((JoinOperator) childReducer).getConf().getId();
}
} else if (childReducer instanceof MapJoinOperator) {
if (parseContext.getMapJoinOps().contains(childReducer)) {
id = ((MapJoinOperator) childReducer).getConf().getId();
}
} else if (childReducer instanceof SMBMapJoinOperator) {
if (parseContext.getSmbMapJoinOps().contains(childReducer)) {
id = ((SMBMapJoinOperator) childReducer).getConf().getId();
}
}
if (id != null) {
streamDesc = id + ":$INTNAME";
} else {
streamDesc = "$INTNAME";
}
String origStreamDesc = streamDesc;
int pos = 0;
while (mapWork.getAliasToWork().get(streamDesc) != null) {
streamDesc = origStreamDesc.concat(String.valueOf(++pos));
}
}
GenMapRedUtils.setTaskPlan(taskTmpDir, streamDesc, tableScanOp, mapWork, false, tableDesc);
// insert the new task between current task and its child
@SuppressWarnings("unchecked") Task<? extends Serializable> newTask = TaskFactory.get(newWork, parseContext.getConf());
List<Task<? extends Serializable>> childTasks = currentTask.getChildTasks();
// must have at most one child
if (childTasks != null && childTasks.size() > 0) {
Task<? extends Serializable> childTask = childTasks.get(0);
currentTask.removeDependentTask(childTask);
newTask.addDependentTask(childTask);
}
currentTask.addDependentTask(newTask);
newTask.setFetchSource(currentTask.isFetchSource());
}
}
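Splitting the task boils down to a reachability walk: everything downstream of the disconnected reduce work is copied into the new SparkWork and removed from the old one. A generic sketch of that walk, using a plain adjacency map instead of SparkWork (the class and method names are illustrative):

// Hypothetical simplification of copyWorkGraph() and the removal loop in splitTask() above.
final class WorkGraphSketch {
  static java.util.Set<String> reachableFrom(String start, java.util.Map<String, java.util.List<String>> children) {
    java.util.Set<String> visited = new java.util.LinkedHashSet<>();
    java.util.Deque<String> pending = new java.util.ArrayDeque<>();
    pending.add(start);
    while (!pending.isEmpty()) {
      String work = pending.poll();
      if (visited.add(work)) {
        // enqueue downstream works so the whole subtree moves into the new SparkWork together
        pending.addAll(children.getOrDefault(work, java.util.Collections.<String>emptyList()));
      }
    }
    return visited;
  }
}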