Use of org.apache.hadoop.hive.ql.plan.MapredLocalWork in project hive by apache.
In the class ExecDriver, method execute:
/**
* Execute a query plan using Hadoop.
*/
@SuppressWarnings({ "deprecation", "unchecked" })
@Override
public int execute(DriverContext driverContext) {
IOPrepareCache ioPrepareCache = IOPrepareCache.get();
ioPrepareCache.clear();
boolean success = true;
Context ctx = driverContext.getCtx();
boolean ctxCreated = false;
Path emptyScratchDir;
JobClient jc = null;
if (driverContext.isShutdown()) {
LOG.warn("Task was cancelled");
return 5;
}
MapWork mWork = work.getMapWork();
ReduceWork rWork = work.getReduceWork();
try {
if (ctx == null) {
ctx = new Context(job);
ctxCreated = true;
}
emptyScratchDir = ctx.getMRTmpPath();
FileSystem fs = emptyScratchDir.getFileSystem(job);
fs.mkdirs(emptyScratchDir);
} catch (IOException e) {
e.printStackTrace();
console.printError("Error launching map-reduce job", "\n" + org.apache.hadoop.util.StringUtils.stringifyException(e));
return 5;
}
HiveFileFormatUtils.prepareJobOutput(job);
//See the javadoc on HiveOutputFormatImpl and HadoopShims.prepareJobOutput()
job.setOutputFormat(HiveOutputFormatImpl.class);
job.setMapperClass(ExecMapper.class);
job.setMapOutputKeyClass(HiveKey.class);
job.setMapOutputValueClass(BytesWritable.class);
try {
String partitioner = HiveConf.getVar(job, ConfVars.HIVEPARTITIONER);
job.setPartitionerClass(JavaUtils.loadClass(partitioner));
} catch (ClassNotFoundException e) {
throw new RuntimeException(e.getMessage(), e);
}
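// Copy split-related settings (number of map tasks, min/max split sizes) from the MapWork into the JobConf.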
propagateSplitSettings(job, mWork);
job.setNumReduceTasks(rWork != null ? rWork.getNumReduceTasks().intValue() : 0);
job.setReducerClass(ExecReducer.class);
// set input format information if necessary
setInputAttributes(job);
// Turn on speculative execution for reducers
boolean useSpeculativeExecReducers = HiveConf.getBoolVar(job, HiveConf.ConfVars.HIVESPECULATIVEEXECREDUCERS);
job.setBoolean(MRJobConfig.REDUCE_SPECULATIVE, useSpeculativeExecReducers);
String inpFormat = HiveConf.getVar(job, HiveConf.ConfVars.HIVEINPUTFORMAT);
if (mWork.isUseBucketizedHiveInputFormat()) {
inpFormat = BucketizedHiveInputFormat.class.getName();
}
LOG.info("Using " + inpFormat);
try {
job.setInputFormat(JavaUtils.loadClass(inpFormat));
} catch (ClassNotFoundException e) {
throw new RuntimeException(e.getMessage(), e);
}
// No-Op - we don't really write anything here ..
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
int returnVal = 0;
boolean noName = StringUtils.isEmpty(job.get(MRJobConfig.JOB_NAME));
if (noName) {
// This is for a special case to ensure unit tests pass
job.set(MRJobConfig.JOB_NAME, "JOB" + Utilities.randGen.nextInt());
}
try {
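// If the map-join local work staged small-table aliases, package the locally built hash table files into an archive and ship it to the cluster via the distributed cache.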
MapredLocalWork localwork = mWork.getMapRedLocalWork();
if (localwork != null && localwork.hasStagedAlias()) {
if (!ShimLoader.getHadoopShims().isLocalMode(job)) {
Path localPath = localwork.getTmpPath();
Path hdfsPath = mWork.getTmpHDFSPath();
FileSystem hdfs = hdfsPath.getFileSystem(job);
FileSystem localFS = localPath.getFileSystem(job);
FileStatus[] hashtableFiles = localFS.listStatus(localPath);
int fileNumber = hashtableFiles.length;
String[] fileNames = new String[fileNumber];
for (int i = 0; i < fileNumber; i++) {
fileNames[i] = hashtableFiles[i].getPath().getName();
}
//package and compress all the hashtable files to an archive file
String stageId = this.getId();
String archiveFileName = Utilities.generateTarFileName(stageId);
localwork.setStageID(stageId);
CompressionUtils.tar(localPath.toUri().getPath(), fileNames, archiveFileName);
Path archivePath = Utilities.generateTarPath(localPath, stageId);
LOG.info("Archive " + hashtableFiles.length + " hash table files to " + archivePath);
//upload archive file to hdfs
Path hdfsFilePath = Utilities.generateTarPath(hdfsPath, stageId);
short replication = (short) job.getInt("mapred.submit.replication", 10);
hdfs.copyFromLocalFile(archivePath, hdfsFilePath);
hdfs.setReplication(hdfsFilePath, replication);
LOG.info("Upload 1 archive file from" + archivePath + " to: " + hdfsFilePath);
//add the archive file to distributed cache
DistributedCache.createSymlink(job);
DistributedCache.addCacheArchive(hdfsFilePath.toUri(), job);
LOG.info("Add 1 archive file to distributed cache. Archive file: " + hdfsFilePath.toUri());
}
}
work.configureJobConf(job);
List<Path> inputPaths = Utilities.getInputPaths(job, mWork, emptyScratchDir, ctx, false);
Utilities.setInputPaths(job, inputPaths);
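// Serialize the query plan (MapredWork) into the MR scratch directory so the map and reduce tasks can load it at runtime.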
Utilities.setMapRedWork(job, work, ctx.getMRTmpPath());
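// When parallel ORDER BY sampling is enabled, sample the input to build a total-order partitioner; if sampling fails, fall back to a single reducer.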
if (mWork.getSamplingType() > 0 && rWork != null && job.getNumReduceTasks() > 1) {
try {
handleSampling(ctx, mWork, job);
job.setPartitionerClass(HiveTotalOrderPartitioner.class);
} catch (IllegalStateException e) {
console.printInfo("Not enough sampling data.. Rolling back to single reducer task");
rWork.setNumReduceTasks(1);
job.setNumReduceTasks(1);
} catch (Exception e) {
LOG.error("Sampling error", e);
console.printError(e.toString(), "\n" + org.apache.hadoop.util.StringUtils.stringifyException(e));
rWork.setNumReduceTasks(1);
job.setNumReduceTasks(1);
}
}
jc = new JobClient(job);
// make this client wait if job tracker is not behaving well.
Throttle.checkJobTracker(job, LOG);
if (mWork.isGatheringStats() || (rWork != null && rWork.isGatheringStats())) {
// initialize stats publishing table
StatsPublisher statsPublisher;
StatsFactory factory = StatsFactory.newFactory(job);
if (factory != null) {
statsPublisher = factory.getStatsPublisher();
List<String> statsTmpDir = Utilities.getStatsTmpDirs(mWork, job);
if (rWork != null) {
statsTmpDir.addAll(Utilities.getStatsTmpDirs(rWork, job));
}
StatsCollectionContext sc = new StatsCollectionContext(job);
sc.setStatsTmpDirs(statsTmpDir);
if (!statsPublisher.init(sc)) {
// creating stats table if not exists
if (HiveConf.getBoolVar(job, HiveConf.ConfVars.HIVE_STATS_RELIABLE)) {
throw new HiveException(ErrorMsg.STATSPUBLISHER_INITIALIZATION_ERROR.getErrorCodedMsg());
}
}
}
}
Utilities.createTmpDirs(job, mWork);
Utilities.createTmpDirs(job, rWork);
SessionState ss = SessionState.get();
if (HiveConf.getVar(job, HiveConf.ConfVars.HIVE_EXECUTION_ENGINE).equals("tez") && ss != null) {
TezSessionState session = ss.getTezSession();
TezSessionPoolManager.getInstance().closeIfNotDefault(session, true);
}
HiveConfUtil.updateJobCredentialProviders(job);
// Finally SUBMIT the JOB!
if (driverContext.isShutdown()) {
LOG.warn("Task was cancelled");
return 5;
}
rj = jc.submitJob(job);
if (driverContext.isShutdown()) {
LOG.warn("Task was cancelled");
if (rj != null) {
rj.killJob();
rj = null;
}
return 5;
}
this.jobID = rj.getJobID();
updateStatusInQueryDisplay();
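// Monitor the submitted job and print progress until it completes; a non-zero return value indicates failure.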
returnVal = jobExecHelper.progress(rj, jc, ctx);
success = (returnVal == 0);
} catch (Exception e) {
e.printStackTrace();
setException(e);
String mesg = " with exception '" + Utilities.getNameMessage(e) + "'";
if (rj != null) {
mesg = "Ended Job = " + rj.getJobID() + mesg;
} else {
mesg = "Job Submission failed" + mesg;
}
// Has to use full name to make sure it does not conflict with
// org.apache.commons.lang.StringUtils
console.printError(mesg, "\n" + org.apache.hadoop.util.StringUtils.stringifyException(e));
success = false;
returnVal = 1;
} finally {
Utilities.clearWork(job);
try {
if (ctxCreated) {
ctx.clear();
}
if (rj != null) {
if (returnVal != 0) {
rj.killJob();
}
jobID = rj.getID().toString();
}
if (jc != null) {
jc.close();
}
} catch (Exception e) {
LOG.warn("Failed while cleaning up ", e);
} finally {
HadoopJobExecHelper.runningJobs.remove(rj);
}
}
// get the list of Dynamic partition paths
try {
if (rj != null) {
if (mWork.getAliasToWork() != null) {
for (Operator<? extends OperatorDesc> op : mWork.getAliasToWork().values()) {
op.jobClose(job, success);
}
}
if (rWork != null) {
rWork.getReducer().jobClose(job, success);
}
}
} catch (Exception e) {
// jobClose needs to execute successfully otherwise fail task
if (success) {
setException(e);
success = false;
returnVal = 3;
String mesg = "Job Commit failed with exception '" + Utilities.getNameMessage(e) + "'";
console.printError(mesg, "\n" + org.apache.hadoop.util.StringUtils.stringifyException(e));
}
}
return (returnVal);
}
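The staged-alias branch above shows the general pattern: tar the local hash table files, copy the archive to HDFS, and register it in the distributed cache. Below is a minimal standalone sketch of that ship-to-cache step using only Hadoop APIs; the class name, method name, and paths are hypothetical and not part of Hive.
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;

public class ArchiveShipSketch {
  public static void ship(JobConf job, Path localArchive, Path hdfsArchive) throws Exception {
    FileSystem hdfs = hdfsArchive.getFileSystem(job);
    // Copy the locally built archive into HDFS.
    hdfs.copyFromLocalFile(localArchive, hdfsArchive);
    // Raise replication (same default of 10 as the snippet above) so many map
    // tasks can localize the archive without overloading a single datanode.
    short replication = (short) job.getInt("mapred.submit.replication", 10);
    hdfs.setReplication(hdfsArchive, replication);
    // Register the archive with the distributed cache; task nodes unpack it
    // and symlink it into each task's working directory.
    DistributedCache.createSymlink(job);
    DistributedCache.addCacheArchive(hdfsArchive.toUri(), job);
  }
}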
Use of org.apache.hadoop.hive.ql.plan.MapredLocalWork in project hive by apache.
In the class MapredLocalTask, method executeInChildVM:
public int executeInChildVM(DriverContext driverContext) {
// execute in child jvm
try {
// generate the cmd line to run in the child jvm
Context ctx = driverContext.getCtx();
String hiveJar = conf.getJar();
String hadoopExec = conf.getVar(HiveConf.ConfVars.HADOOPBIN);
conf.setVar(ConfVars.HIVEADDEDJARS, Utilities.getResourceFiles(conf, SessionState.ResourceType.JAR));
// write out the plan to a local file
Path planPath = new Path(ctx.getLocalTmpPath(), "plan.xml");
MapredLocalWork plan = getWork();
LOG.info("Generating plan file " + planPath.toString());
OutputStream out = null;
try {
out = FileSystem.getLocal(conf).create(planPath);
SerializationUtilities.serializePlan(plan, out);
out.close();
out = null;
} finally {
IOUtils.closeQuietly(out);
}
String isSilent = "true".equalsIgnoreCase(System.getProperty("test.silent")) ? "-nolog" : "";
String jarCmd;
jarCmd = hiveJar + " " + ExecDriver.class.getName();
String hiveConfArgs = ExecDriver.generateCmdLine(conf, ctx);
String cmdLine = hadoopExec + " jar " + jarCmd + " -localtask -plan " + planPath.toString() + " " + isSilent + " " + hiveConfArgs;
String workDir = (new File(".")).getCanonicalPath();
String files = Utilities.getResourceFiles(conf, SessionState.ResourceType.FILE);
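// If extra resource files were added, pass them with -files and symlink them into a dedicated working directory for the child JVM.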
if (!files.isEmpty()) {
cmdLine = cmdLine + " -files " + files;
workDir = ctx.getLocalTmpPath().toUri().getPath();
if (!(new File(workDir)).mkdir()) {
throw new IOException("Cannot create tmp working dir: " + workDir);
}
for (String f : StringUtils.split(files, ',')) {
Path p = new Path(f);
String target = p.toUri().getPath();
String link = workDir + Path.SEPARATOR + p.getName();
if (FileUtil.symLink(target, link) != 0) {
throw new IOException("Cannot link to added file: " + target + " from: " + link);
}
}
}
// Inherit Java system variables
String hadoopOpts;
StringBuilder sb = new StringBuilder();
Properties p = System.getProperties();
for (String element : HIVE_SYS_PROP) {
if (p.containsKey(element)) {
sb.append(" -D" + element + "=" + p.getProperty(element));
}
}
hadoopOpts = sb.toString();
// Inherit the environment variables
String[] env;
Map<String, String> variables = new HashMap<String, String>(System.getenv());
// The user can specify the hadoop memory
// if ("local".equals(conf.getVar(HiveConf.ConfVars.HADOOPJT))) {
// if we are running in local mode - then the amount of memory used
// by the child jvm can no longer default to the memory used by the
// parent jvm
// int hadoopMem = conf.getIntVar(HiveConf.ConfVars.HIVEHADOOPMAXMEM);
int hadoopMem = conf.getIntVar(HiveConf.ConfVars.HIVEHADOOPMAXMEM);
if (hadoopMem == 0) {
// remove env var that would default child jvm to use parent's memory
// as default. child jvm would use default memory for a hadoop client
variables.remove(HADOOP_MEM_KEY);
} else {
// user specified the memory for local mode hadoop run
console.printInfo(" set heap size\t" + hadoopMem + "MB");
variables.put(HADOOP_MEM_KEY, String.valueOf(hadoopMem));
}
// } else {
// nothing to do - we are not running in local mode - only submitting
// the job via a child process. in this case it's appropriate that the
// child jvm use the same memory as the parent jvm
// }
//Set HADOOP_USER_NAME env variable for child process, so that
// it also runs with hadoop permissions for the user the job is running as
// This will be used by hadoop only in unsecured (non-Kerberos) mode
String endUserName = Utils.getUGI().getShortUserName();
LOG.debug("setting HADOOP_USER_NAME\t" + endUserName);
variables.put("HADOOP_USER_NAME", endUserName);
if (variables.containsKey(HADOOP_OPTS_KEY)) {
variables.put(HADOOP_OPTS_KEY, variables.get(HADOOP_OPTS_KEY) + hadoopOpts);
} else {
variables.put(HADOOP_OPTS_KEY, hadoopOpts);
}
// HiveServer2 passes "-hiveconf hive.hadoop.classpath=%HIVE_LIB%"; append it to HADOOP_CLASSPATH so the paths are combined.
if (HiveConf.getVar(conf, HiveConf.ConfVars.HIVE_HADOOP_CLASSPATH) != null) {
if (variables.containsKey("HADOOP_CLASSPATH")) {
variables.put("HADOOP_CLASSPATH", variables.get("HADOOP_CLASSPATH") + ";" + HiveConf.getVar(conf, HiveConf.ConfVars.HIVE_HADOOP_CLASSPATH));
} else {
variables.put("HADOOP_CLASSPATH", HiveConf.getVar(conf, HiveConf.ConfVars.HIVE_HADOOP_CLASSPATH));
}
}
if (variables.containsKey(MapRedTask.HIVE_DEBUG_RECURSIVE)) {
MapRedTask.configureDebugVariablesForChildJVM(variables);
}
if (UserGroupInformation.isSecurityEnabled() && UserGroupInformation.isLoginKeytabBased()) {
//If kerberos security is enabled, and HS2 doAs is enabled,
// then additional params need to be set so that the command is run as
// intended user
secureDoAs = new SecureCmdDoAs(conf);
secureDoAs.addEnv(variables);
}
// If HIVE_LOCAL_TASK_CHILD_OPTS is set, use it as HADOOP_CLIENT_OPTS for the child JVM
// (and substitute it into HADOOP_OPTS) so the local task can have different settings from those of HiveServer2.
if (variables.containsKey(HIVE_LOCAL_TASK_CHILD_OPTS_KEY)) {
String childOpts = variables.get(HIVE_LOCAL_TASK_CHILD_OPTS_KEY);
if (childOpts == null) {
childOpts = "";
}
String clientOpts = variables.put(HADOOP_CLIENT_OPTS, childOpts);
String tmp = variables.get(HADOOP_OPTS_KEY);
if (tmp != null && !StringUtils.isBlank(clientOpts)) {
tmp = tmp.replace(clientOpts, childOpts);
variables.put(HADOOP_OPTS_KEY, tmp);
}
}
env = new String[variables.size()];
int pos = 0;
for (Map.Entry<String, String> entry : variables.entrySet()) {
String name = entry.getKey();
String value = entry.getValue();
env[pos++] = name + "=" + value;
LOG.debug("Setting env: " + name + "=" + LogUtils.maskIfPassword(name, value));
}
LOG.info("Executing: " + cmdLine);
// Run ExecDriver in another JVM
executor = Runtime.getRuntime().exec(cmdLine, env, new File(workDir));
CachingPrintStream errPrintStream = new CachingPrintStream(System.err);
StreamPrinter outPrinter;
StreamPrinter errPrinter;
OperationLog operationLog = OperationLog.getCurrentOperationLog();
if (operationLog != null) {
outPrinter = new StreamPrinter(executor.getInputStream(), null, System.out, operationLog.getPrintStream());
errPrinter = new StreamPrinter(executor.getErrorStream(), null, errPrintStream, operationLog.getPrintStream());
} else {
outPrinter = new StreamPrinter(executor.getInputStream(), null, System.out);
errPrinter = new StreamPrinter(executor.getErrorStream(), null, errPrintStream);
}
outPrinter.start();
errPrinter.start();
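// Wait for the child process to exit while relaying its progress; a non-zero exit status means the local task failed.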
int exitVal = jobExecHelper.progressLocal(executor, getId());
// wait for stream threads to finish
outPrinter.join();
errPrinter.join();
if (exitVal != 0) {
LOG.error("Execution failed with exit status: " + exitVal);
if (SessionState.get() != null) {
SessionState.get().addLocalMapRedErrors(getId(), errPrintStream.getOutput());
}
} else {
LOG.info("Execution completed successfully");
}
return exitVal;
} catch (Exception e) {
LOG.error("Exception: ", e);
return (1);
} finally {
if (secureDoAs != null) {
secureDoAs.close();
}
}
}
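The snippet above uses Hive helpers (CachingPrintStream, StreamPrinter, HadoopJobExecHelper.progressLocal) to relay the child JVM's output and wait for it. Below is a minimal JDK-only sketch of the same launch-and-drain pattern; the class and method names are hypothetical.
import java.io.BufferedReader;
import java.io.File;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PrintStream;

public class ChildVmSketch {
  public static int run(String cmdLine, String[] env, File workDir) throws Exception {
    // Launch the child process with an explicit environment and working directory,
    // mirroring Runtime.getRuntime().exec(cmdLine, env, new File(workDir)) above.
    Process child = Runtime.getRuntime().exec(cmdLine, env, workDir);
    Thread out = drain(child.getInputStream(), System.out);
    Thread err = drain(child.getErrorStream(), System.err);
    out.start();
    err.start();
    int exitVal = child.waitFor(); // block until the child JVM finishes
    out.join();                    // make sure all of its output has been relayed
    err.join();
    return exitVal;
  }

  private static Thread drain(InputStream in, PrintStream target) {
    // Relay one stream line by line on a background thread so the child
    // cannot block on a full stdout/stderr pipe.
    return new Thread(() -> {
      try (BufferedReader reader = new BufferedReader(new InputStreamReader(in))) {
        String line;
        while ((line = reader.readLine()) != null) {
          target.println(line);
        }
      } catch (Exception ignored) {
        // best-effort draining; the caller only cares about the exit status
      }
    });
  }
}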
Use of org.apache.hadoop.hive.ql.plan.MapredLocalWork in project hive by apache.
In the class HashTableLoader, method loadDirectly:
private void loadDirectly(MapJoinTableContainer[] mapJoinTables, String inputFileName) throws Exception {
MapredLocalWork localWork = context.getLocalWork();
List<Operator<?>> directWorks = localWork.getDirectFetchOp().get(joinOp);
if (directWorks == null || directWorks.isEmpty()) {
return;
}
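// Run the small-table fetch operators in-process by wiring them into a temporary hash table sink operator.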
JobConf job = new JobConf(hconf);
MapredLocalTask localTask = new MapredLocalTask(localWork, job, false);
HashTableSinkOperator sink = new TemporaryHashSinkOperator(new CompilationOpContext(), desc);
sink.setParentOperators(new ArrayList<Operator<? extends OperatorDesc>>(directWorks));
for (Operator<?> operator : directWorks) {
if (operator != null) {
operator.setChildOperators(Arrays.<Operator<? extends OperatorDesc>>asList(sink));
}
}
localTask.setExecContext(context);
localTask.startForward(inputFileName);
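// Pick up the hash tables built by the sink for each small-table parent position.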
MapJoinTableContainer[] tables = sink.getMapJoinTables();
for (int i = 0; i < sink.getNumParent(); i++) {
if (sink.getParentOperators().get(i) != null) {
mapJoinTables[i] = tables[i];
}
}
Arrays.fill(tables, null);
}
Use of org.apache.hadoop.hive.ql.plan.MapredLocalWork in project hive by apache.
In the class HashTableLoader, method load:
@Override
public void load(MapJoinTableContainer[] mapJoinTables, MapJoinTableContainerSerDe[] mapJoinTableSerdes) throws HiveException {
// Note: it's possible that a MJ operator is in a ReduceWork, in which case the
// currentInputPath will be null. But, since currentInputPath is only interesting
// for bucket join case, and for bucket join the MJ operator will always be in
// a MapWork, this should be OK.
String currentInputPath = context.getCurrentInputPath() == null ? null : context.getCurrentInputPath().toString();
LOG.info("******* Load from HashTable for input file: " + currentInputPath);
MapredLocalWork localWork = context.getLocalWork();
try {
if (localWork.getDirectFetchOp() != null) {
loadDirectly(mapJoinTables, currentInputPath);
}
// All HashTables share the same base dir,
// which is passed in as the tmp path
Path baseDir = localWork.getTmpPath();
if (baseDir == null) {
return;
}
FileSystem fs = FileSystem.get(baseDir.toUri(), hconf);
BucketMapJoinContext mapJoinCtx = localWork.getBucketMapjoinContext();
boolean firstContainer = true;
boolean useOptimizedContainer = !useFastContainer && HiveConf.getBoolVar(hconf, HiveConf.ConfVars.HIVEMAPJOINUSEOPTIMIZEDTABLE);
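// For every small-table position, locate its dumped hash table file under the shared tmp dir and load it into a table container.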
for (int pos = 0; pos < mapJoinTables.length; pos++) {
if (pos == desc.getPosBigTable() || mapJoinTables[pos] != null) {
continue;
}
if (useOptimizedContainer) {
MapJoinObjectSerDeContext keyCtx = mapJoinTableSerdes[pos].getKeyContext();
ObjectInspector keyOI = keyCtx.getSerDe().getObjectInspector();
if (!MapJoinBytesTableContainer.isSupportedKey(keyOI)) {
if (firstContainer) {
LOG.warn("Not using optimized table container." + "Only a subset of mapjoin keys is supported.");
useOptimizedContainer = false;
HiveConf.setBoolVar(hconf, HiveConf.ConfVars.HIVEMAPJOINUSEOPTIMIZEDTABLE, false);
} else {
throw new HiveException("Only a subset of mapjoin keys is supported.");
}
}
}
firstContainer = false;
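// For a bucket map join, resolve the bucket-specific file name for the current big-table input via the alias bucket-file mappings; otherwise a single (non-bucketed) hash table file is used.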
String bigInputPath = currentInputPath;
if (currentInputPath != null && mapJoinCtx != null) {
if (!desc.isBucketMapJoin()) {
bigInputPath = null;
} else {
Set<String> aliases = ((SparkBucketMapJoinContext) mapJoinCtx).getPosToAliasMap().get(pos);
String alias = aliases.iterator().next();
// Any one small table input path
String smallInputPath = mapJoinCtx.getAliasBucketFileNameMapping().get(alias).get(bigInputPath).get(0);
bigInputPath = mapJoinCtx.getMappingBigFile(alias, smallInputPath);
}
}
String fileName = localWork.getBucketFileName(bigInputPath);
Path path = Utilities.generatePath(baseDir, desc.getDumpFilePrefix(), (byte) pos, fileName);
mapJoinTables[pos] = load(fs, path, mapJoinTableSerdes[pos]);
}
} catch (Exception e) {
throw new HiveException(e);
}
}
Use of org.apache.hadoop.hive.ql.plan.MapredLocalWork in project hive by apache.
In the class CommonJoinTaskDispatcher, method mergeMapJoinTaskIntoItsChildMapRedTask:
/*
* A task and its child task have been converted from join to mapjoin.
* See if the two tasks can be merged.
*/
private void mergeMapJoinTaskIntoItsChildMapRedTask(MapRedTask mapJoinTask, Configuration conf) throws SemanticException {
// Check that mapJoinTask has exactly one child task; if so, see whether we can merge mapJoinTask into that child.
if (mapJoinTask.getChildTasks() == null || mapJoinTask.getChildTasks().size() > 1) {
// No child task, or more than one child task, in which case we don't want to do anything.
return;
}
Task<? extends Serializable> childTask = mapJoinTask.getChildTasks().get(0);
if (!(childTask instanceof MapRedTask)) {
// Nothing to do if it is not a MapReduce task.
return;
}
MapRedTask childMapRedTask = (MapRedTask) childTask;
MapWork mapJoinMapWork = mapJoinTask.getWork().getMapWork();
MapWork childMapWork = childMapRedTask.getWork().getMapWork();
Map<String, Operator<? extends OperatorDesc>> mapJoinAliasToWork = mapJoinMapWork.getAliasToWork();
if (mapJoinAliasToWork.size() > 1) {
// Do not merge if the MapredWork of MapJoin has multiple input aliases.
return;
}
Entry<String, Operator<? extends OperatorDesc>> mapJoinAliasToWorkEntry = mapJoinAliasToWork.entrySet().iterator().next();
String mapJoinAlias = mapJoinAliasToWorkEntry.getKey();
TableScanOperator mapJoinTaskTableScanOperator = OperatorUtils.findSingleOperator(mapJoinAliasToWorkEntry.getValue(), TableScanOperator.class);
if (mapJoinTaskTableScanOperator == null) {
throw new SemanticException("Expected a " + TableScanOperator.getOperatorName() + " operator as the work associated with alias " + mapJoinAlias + ". Found a " + mapJoinAliasToWork.get(mapJoinAlias).getName() + " operator.");
}
FileSinkOperator mapJoinTaskFileSinkOperator = OperatorUtils.findSingleOperator(mapJoinTaskTableScanOperator, FileSinkOperator.class);
if (mapJoinTaskFileSinkOperator == null) {
throw new SemanticException("Cannot find the " + FileSinkOperator.getOperatorName() + " operator at the last operator of the MapJoin Task.");
}
// The mapJoinTaskFileSinkOperator writes to a different directory
Path childMRPath = mapJoinTaskFileSinkOperator.getConf().getDirName();
List<String> childMRAliases = childMapWork.getPathToAliases().get(childMRPath);
if (childMRAliases == null || childMRAliases.size() != 1) {
return;
}
String childMRAlias = childMRAliases.get(0);
// Sanity check to make sure there is no alias conflict after merge.
for (Entry<Path, ArrayList<String>> entry : childMapWork.getPathToAliases().entrySet()) {
Path path = entry.getKey();
List<String> aliases = entry.getValue();
if (path.equals(childMRPath)) {
continue;
}
if (aliases.contains(mapJoinAlias)) {
// An alias conflict should not happen here.
return;
}
}
MapredLocalWork mapJoinLocalWork = mapJoinMapWork.getMapRedLocalWork();
MapredLocalWork childLocalWork = childMapWork.getMapRedLocalWork();
if ((mapJoinLocalWork != null && mapJoinLocalWork.getBucketMapjoinContext() != null) || (childLocalWork != null && childLocalWork.getBucketMapjoinContext() != null)) {
// Bucket map join local work is not handled here yet. We should relax this constraint with a follow-up jira.
return;
}
// Do not merge unless the total size of the local tables is under the limit after the merge.
if (!isLocalTableTotalSizeUnderLimitAfterMerge(conf, mapJoinLocalWork, childLocalWork)) {
// Do not merge.
return;
}
TableScanOperator childMRTaskTableScanOperator = OperatorUtils.findSingleOperator(childMapWork.getAliasToWork().get(childMRAlias.toString()), TableScanOperator.class);
if (childMRTaskTableScanOperator == null) {
throw new SemanticException("Expected a " + TableScanOperator.getOperatorName() + " operator as the work associated with alias " + childMRAlias + ". Found a " + childMapWork.getAliasToWork().get(childMRAlias).getName() + " operator.");
}
List<Operator<? extends OperatorDesc>> parentsInMapJoinTask = mapJoinTaskFileSinkOperator.getParentOperators();
List<Operator<? extends OperatorDesc>> childrenInChildMRTask = childMRTaskTableScanOperator.getChildOperators();
if (parentsInMapJoinTask.size() > 1 || childrenInChildMRTask.size() > 1) {
// Do not merge if we do not know how to connect two operator trees.
return;
}
// Step 2: Merge mapJoinTask into the Map-side of its child.
// Step 2.1: Connect the operator trees of two MapRedTasks.
Operator<? extends OperatorDesc> parentInMapJoinTask = parentsInMapJoinTask.get(0);
Operator<? extends OperatorDesc> childInChildMRTask = childrenInChildMRTask.get(0);
parentInMapJoinTask.replaceChild(mapJoinTaskFileSinkOperator, childInChildMRTask);
childInChildMRTask.replaceParent(childMRTaskTableScanOperator, parentInMapJoinTask);
// Step 2.2: Replace the corresponding part childMRWork's MapWork.
GenMapRedUtils.replaceMapWork(mapJoinAlias, childMRAlias.toString(), mapJoinMapWork, childMapWork);
// Step 2.3: Fill up stuff in local work
if (mapJoinLocalWork != null) {
if (childLocalWork == null) {
childMapWork.setMapRedLocalWork(mapJoinLocalWork);
} else {
childLocalWork.getAliasToFetchWork().putAll(mapJoinLocalWork.getAliasToFetchWork());
childLocalWork.getAliasToWork().putAll(mapJoinLocalWork.getAliasToWork());
}
}
// Step 2.4: Remove this MapJoin task
List<Task<? extends Serializable>> parentTasks = mapJoinTask.getParentTasks();
mapJoinTask.setParentTasks(null);
mapJoinTask.setChildTasks(null);
childMapRedTask.getParentTasks().remove(mapJoinTask);
if (parentTasks != null) {
childMapRedTask.getParentTasks().addAll(parentTasks);
for (Task<? extends Serializable> parentTask : parentTasks) {
parentTask.getChildTasks().remove(mapJoinTask);
if (!parentTask.getChildTasks().contains(childMapRedTask)) {
parentTask.getChildTasks().add(childMapRedTask);
}
}
} else {
if (physicalContext.getRootTasks().contains(mapJoinTask)) {
physicalContext.removeFromRootTask(mapJoinTask);
if (childMapRedTask.getParentTasks() != null && childMapRedTask.getParentTasks().size() == 0 && !physicalContext.getRootTasks().contains(childMapRedTask)) {
physicalContext.addToRootTask(childMapRedTask);
}
}
}
if (childMapRedTask.getParentTasks().size() == 0) {
childMapRedTask.setParentTasks(null);
}
}