use of org.apache.hadoop.hive.ql.exec.mr.MapRedTask in project hive by apache.
the class Utilities method reworkMapRedWork.
/**
 * The check here is not especially clean. It first loops over all input formats
 * and collects the ones that implement ReworkMapredInputFormat into a set, and
 * finally goes through that set and calls rework on each one.
 *
 * Technically all of this could be avoided if all of Hive's input formats shared
 * a common interface. In today's Hive and Hadoop this is not possible, because
 * many of the input formats Hive uses live in Hadoop's code base, and most of
 * Hadoop's input formats only implement the InputFormat interface.
 *
 * @param task
 * @param reworkMapredWork
 * @param conf
 * @throws SemanticException
 */
public static void reworkMapRedWork(Task<?> task, boolean reworkMapredWork, HiveConf conf) throws SemanticException {
  if (reworkMapredWork && (task instanceof MapRedTask)) {
    try {
      MapredWork mapredWork = ((MapRedTask) task).getWork();
      Set<Class<? extends InputFormat>> reworkInputFormats = new HashSet<Class<? extends InputFormat>>();
      for (PartitionDesc part : mapredWork.getMapWork().getPathToPartitionInfo().values()) {
        Class<? extends InputFormat> inputFormatCls = part.getInputFileFormatClass();
        if (ReworkMapredInputFormat.class.isAssignableFrom(inputFormatCls)) {
          reworkInputFormats.add(inputFormatCls);
        }
      }
      if (reworkInputFormats.size() > 0) {
        for (Class<? extends InputFormat> inputFormatCls : reworkInputFormats) {
          ReworkMapredInputFormat inst = (ReworkMapredInputFormat) ReflectionUtil.newInstance(inputFormatCls, null);
          inst.rework(conf, mapredWork);
        }
      }
    } catch (IOException e) {
      throw new SemanticException(e);
    }
  }
}
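A class opts into this pass simply by implementing ReworkMapredInputFormat. Below is a minimal sketch of such an input format; the class name is hypothetical, and the rework(HiveConf, MapredWork) signature and its IOException are inferred from the call and catch block above rather than quoted from the interface.
import java.io.IOException;

import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.io.ReworkMapredInputFormat;
import org.apache.hadoop.hive.ql.plan.MapredWork;
import org.apache.hadoop.hive.ql.plan.PartitionDesc;
import org.apache.hadoop.mapred.TextInputFormat;

// Hypothetical input format; opting into Utilities.reworkMapRedWork only requires
// implementing ReworkMapredInputFormat on top of an existing InputFormat.
public class ReworkAwareTextInputFormat extends TextInputFormat implements ReworkMapredInputFormat {

  @Override
  public void rework(HiveConf job, MapredWork work) throws IOException {
    // Inspect (or adjust) the plan before job submission; here we only check
    // that every partition registered in the map work has an input format class.
    for (PartitionDesc part : work.getMapWork().getPathToPartitionInfo().values()) {
      if (part.getInputFileFormatClass() == null) {
        throw new IOException("Partition without an input format class");
      }
    }
  }
}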
use of org.apache.hadoop.hive.ql.exec.mr.MapRedTask in project hive by apache.
the class SortMergeJoinTaskDispatcher method convertSMBTaskToMapJoinTask.
// create a map join task, using the table at bigTablePosition as the big table
private MapRedTask convertSMBTaskToMapJoinTask(MapredWork origWork, int bigTablePosition, SMBMapJoinOperator smbJoinOp) throws SemanticException {
  // deep copy a new mapred work
  MapredWork newWork = SerializationUtilities.clonePlan(origWork);
  // create a mapred task for this work
  MapRedTask newTask = (MapRedTask) TaskFactory.get(newWork);
  // generate the map join operator; the map join has already been checked
  MapJoinOperator newMapJoinOp = getMapJoinOperator(newTask, newWork, smbJoinOp, bigTablePosition);
  // The reducer needs to be restored. Consider a query like:
  //   select count(*) FROM bucket_big a JOIN bucket_small b ON a.key = b.key;
  // The reducer contains a group-by, which needs to be restored.
  ReduceWork rWork = newWork.getReduceWork();
  // create the local work for this plan
  MapJoinProcessor.genLocalWorkForMapJoin(newWork, newMapJoinOp, bigTablePosition);
  // restore the reducer
  newWork.setReduceWork(rWork);
  return newTask;
}
use of org.apache.hadoop.hive.ql.exec.mr.MapRedTask in project hive by apache.
the class SortMergeJoinTaskDispatcher method processCurrentTask.
@Override
public Task<?> processCurrentTask(MapRedTask currTask, ConditionalTask conditionalTask, Context context) throws SemanticException {
  // whether it contains a sort merge join operator
  MapredWork currWork = currTask.getWork();
  SMBMapJoinOperator originalSMBJoinOp = getSMBMapJoinOp(currWork);
  if (!isEligibleForOptimization(originalSMBJoinOp)) {
    return null;
  }
  currTask.setTaskTag(Task.CONVERTED_SORTMERGEJOIN);
  // Convert the work containing the sort-merge join into a work as if it had a regular join.
  // Note that the operator tree is not changed - it still contains the SMB join - but the
  // plan is changed (aliasToWork etc.) to contain all the paths as if it were a regular join.
  // This is used to convert the plan to a map-join, and then the original SMB join plan is used
  // as a backup task.
  MapredWork currJoinWork = convertSMBWorkToJoinWork(currWork, originalSMBJoinOp);
  SMBMapJoinOperator newSMBJoinOp = getSMBMapJoinOp(currJoinWork);
  currWork.getMapWork().setLeftInputJoin(originalSMBJoinOp.getConf().isLeftInputJoin());
  currWork.getMapWork().setBaseSrc(originalSMBJoinOp.getConf().getBaseSrc());
  currWork.getMapWork().setMapAliases(originalSMBJoinOp.getConf().getMapAliases());
  currJoinWork.getMapWork().setLeftInputJoin(originalSMBJoinOp.getConf().isLeftInputJoin());
  currJoinWork.getMapWork().setBaseSrc(originalSMBJoinOp.getConf().getBaseSrc());
  currJoinWork.getMapWork().setMapAliases(originalSMBJoinOp.getConf().getMapAliases());
  // create the conditional work list and task list
  List<Serializable> listWorks = new ArrayList<Serializable>();
  List<Task<?>> listTasks = new ArrayList<Task<?>>();
  // create the task-to-aliases mapping and alias-to-input-file mapping for the resolver.
  // Must be a deterministic-order map for consistent q-test output across Java versions.
  HashMap<Task<?>, Set<String>> taskToAliases = new LinkedHashMap<Task<?>, Set<String>>();
  // Note that pathToAliases will behave as if the original plan was a join plan
  Map<Path, List<String>> pathToAliases = currJoinWork.getMapWork().getPathToAliases();
  // generate a map join task for the big table
  SMBJoinDesc originalSMBJoinDesc = originalSMBJoinOp.getConf();
  Byte[] order = originalSMBJoinDesc.getTagOrder();
  int numAliases = order.length;
  Set<Integer> bigTableCandidates = MapJoinProcessor.getBigTableCandidates(originalSMBJoinDesc.getConds());
  HashMap<String, Long> aliasToSize = new HashMap<String, Long>();
  Configuration conf = context.getConf();
  try {
    long aliasTotalKnownInputSize = getTotalKnownInputSize(context, currJoinWork.getMapWork(), pathToAliases, aliasToSize);
    long ThresholdOfSmallTblSizeSum = HiveConf.getLongVar(conf, HiveConf.ConfVars.HIVESMALLTABLESFILESIZE);
    for (int bigTablePosition = 0; bigTablePosition < numAliases; bigTablePosition++) {
      // this table cannot be the big table
      if (!bigTableCandidates.contains(bigTablePosition)) {
        continue;
      }
      // create a map join task for the given big table position
      MapRedTask newTask = convertSMBTaskToMapJoinTask(currJoinWork, bigTablePosition, newSMBJoinOp);
      MapWork mapWork = newTask.getWork().getMapWork();
      Operator<?> parentOp = originalSMBJoinOp.getParentOperators().get(bigTablePosition);
      Set<String> aliases = GenMapRedUtils.findAliases(mapWork, parentOp);
      long aliasKnownSize = Utilities.sumOf(aliasToSize, aliases);
      if (aliasKnownSize > 0) {
        long smallTblTotalKnownSize = aliasTotalKnownInputSize - aliasKnownSize;
        if (smallTblTotalKnownSize > ThresholdOfSmallTblSizeSum) {
          // this table is not a good candidate for the big table
          continue;
        }
      }
      // add into the conditional task
      listWorks.add(newTask.getWork());
      listTasks.add(newTask);
      newTask.setTaskTag(Task.CONVERTED_MAPJOIN);
      newTask.setFetchSource(currTask.isFetchSource());
      // set up the backup task
      newTask.setBackupTask(currTask);
      newTask.setBackupChildrenTasks(currTask.getChildTasks());
      // record the task-to-aliases mapping
      taskToAliases.put(newTask, aliases);
    }
  } catch (Exception e) {
    throw new SemanticException("Generate Map Join Task Error", e);
  }
  // insert the current common join task into the conditional task
  listWorks.add(currTask.getWork());
  listTasks.add(currTask);
  // clear JoinTree and OP Parse Context
  currWork.getMapWork().setLeftInputJoin(false);
  currWork.getMapWork().setBaseSrc(null);
  currWork.getMapWork().setMapAliases(null);
  // create the conditional task and insert it into the task tree
  ConditionalWork cndWork = new ConditionalWork(listWorks);
  ConditionalTask cndTsk = (ConditionalTask) TaskFactory.get(cndWork);
  cndTsk.setListTasks(listTasks);
  // set the resolver and resolver context
  cndTsk.setResolver(new ConditionalResolverCommonJoin());
  ConditionalResolverCommonJoinCtx resolverCtx = new ConditionalResolverCommonJoinCtx();
  resolverCtx.setPathToAliases(pathToAliases);
  resolverCtx.setAliasToKnownSize(aliasToSize);
  resolverCtx.setTaskToAliases(taskToAliases);
  resolverCtx.setCommonJoinTask(currTask);
  resolverCtx.setLocalTmpDir(context.getLocalScratchDir(false));
  resolverCtx.setHdfsTmpDir(context.getMRScratchDir());
  cndTsk.setResolverCtx(resolverCtx);
  // replace the current task with the newly generated conditional task
  replaceTaskWithConditionalTask(currTask, cndTsk);
  return cndTsk;
}
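The big-table selection above reduces to a simple size gate: a candidate position is kept only if the combined known size of all the other aliases fits under the HiveConf.ConfVars.HIVESMALLTABLESFILESIZE threshold, while candidates of unknown size are not rejected on size grounds. A standalone restatement of that rule (a hypothetical helper, not part of Hive) for clarity:
// Mirrors the size check inside the bigTablePosition loop above.
static boolean acceptAsBigTable(long aliasTotalKnownInputSize, long bigTableKnownSize, long smallTableSizeThreshold) {
  if (bigTableKnownSize <= 0) {
    // Size unknown: the dispatcher does not reject the candidate on size grounds.
    return true;
  }
  // Everything except the candidate must fit within the small-table budget.
  long smallTablesKnownSize = aliasTotalKnownInputSize - bigTableKnownSize;
  return smallTablesKnownSize <= smallTableSizeThreshold;
}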
use of org.apache.hadoop.hive.ql.exec.mr.MapRedTask in project hive by apache.
the class TaskQueue method finished.
public void finished(TaskRunner runner) {
  if (statsTasks.isEmpty() || !(runner.getTask() instanceof MapRedTask)) {
    return;
  }
  MapRedTask mapredTask = (MapRedTask) runner.getTask();
  MapWork mapWork = mapredTask.getWork().getMapWork();
  ReduceWork reduceWork = mapredTask.getWork().getReduceWork();
  List<Operator> operators = new ArrayList<Operator>(mapWork.getAliasToWork().values());
  if (reduceWork != null) {
    operators.add(reduceWork.getReducer());
  }
  final List<String> statKeys = new ArrayList<String>(1);
  NodeUtils.iterate(operators, FileSinkOperator.class, new Function<FileSinkOperator>() {

    @Override
    public void apply(FileSinkOperator fsOp) {
      if (fsOp.getConf().isGatherStats()) {
        statKeys.add(fsOp.getConf().getStatsAggPrefix());
      }
    }
  });
  for (String statKey : statKeys) {
    if (statsTasks.containsKey(statKey)) {
      statsTasks.get(statKey).getWork().setSourceTask(mapredTask);
    } else {
      LOG.debug("There is no corresponding statTask for: " + statKey);
    }
  }
}
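Since the anonymous class above only supplies a single apply method, the traversal can be written more compactly with a lambda, assuming Function here is Hive's single-method NodeUtils.Function callback (which the usage suggests):
// Equivalent traversal with a lambda; statKeys is final, so it can be captured.
NodeUtils.iterate(operators, FileSinkOperator.class, fsOp -> {
  if (fsOp.getConf().isGatherStats()) {
    statKeys.add(fsOp.getConf().getStatsAggPrefix());
  }
});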
use of org.apache.hadoop.hive.ql.exec.mr.MapRedTask in project hive by apache.
the class TestSymlinkTextInputFormat method testCombine.
/**
 * Test combining a symlink text input file. There are two input directories,
 * each containing one file, and a single symlink file referencing both files.
 * Normally, without combining, this would return at least 2 splits.
 */
@Test
public void testCombine() throws Exception {
  JobConf newJob = new JobConf(job);
  FileSystem fs = dataDir1.getFileSystem(newJob);
  Path dir1_file1 = new Path(dataDir1, "combinefile1_1");
  writeTextFile(dir1_file1, "dir1_file1_line1\n" + "dir1_file1_line2\n");
  Path dir2_file1 = new Path(dataDir2, "combinefile2_1");
  writeTextFile(dir2_file1, "dir2_file1_line1\n" + "dir2_file1_line2\n");
  // A symlink file that references the first file from the first dir and the
  // second file from the second dir.
  writeSymlinkFile(new Path(symlinkDir, "symlink_file"), new Path(dataDir1, "combinefile1_1"), new Path(dataDir2, "combinefile2_1"));
  HiveConf hiveConf = new HiveConf(TestSymlinkTextInputFormat.class);
  hiveConf.setVar(HiveConf.ConfVars.HIVE_AUTHORIZATION_MANAGER, "org.apache.hadoop.hive.ql.security.authorization.plugin.sqlstd.SQLStdHiveAuthorizerFactory");
  HiveConf.setBoolVar(hiveConf, HiveConf.ConfVars.HIVE_REWORK_MAPREDWORK, true);
  HiveConf.setBoolVar(hiveConf, HiveConf.ConfVars.HIVE_SUPPORT_CONCURRENCY, false);
  Driver drv = new Driver(hiveConf);
  String tblName = "text_symlink_text";
  String createSymlinkTableCmd = "create table " + tblName + " (key int) stored as " + " inputformat 'org.apache.hadoop.hive.ql.io.SymlinkTextInputFormat' " + " outputformat 'org.apache.hadoop.hive.ql.io.IgnoreKeyTextOutputFormat'";
  SessionState.start(hiveConf);
  boolean tblCreated = false;
  try {
    int ecode = 0;
    drv.run(createSymlinkTableCmd);
    tblCreated = true;
    String loadFileCommand = "LOAD DATA LOCAL INPATH '" + new Path(symlinkDir, "symlink_file").toString() + "' INTO TABLE " + tblName;
    drv.run(loadFileCommand);
    String cmd = "select key*1 from " + tblName;
    ecode = drv.compile(cmd, true);
    if (ecode != 0) {
      throw new Exception("Select compile: " + cmd + " failed with exit code= " + ecode);
    }
    // create the scratch dir
    Context ctx = new Context(newJob);
    Path emptyScratchDir = ctx.getMRTmpPath();
    FileSystem fileSys = emptyScratchDir.getFileSystem(newJob);
    fileSys.mkdirs(emptyScratchDir);
    QueryPlan plan = drv.getPlan();
    MapRedTask selectTask = (MapRedTask) plan.getRootTasks().get(0);
    List<Path> inputPaths = Utilities.getInputPaths(newJob, selectTask.getWork().getMapWork(), emptyScratchDir, ctx, false);
    Utilities.setInputPaths(newJob, inputPaths);
    Utilities.setMapRedWork(newJob, selectTask.getWork(), ctx.getMRTmpPath());
    CombineHiveInputFormat combineInputFormat = ReflectionUtils.newInstance(CombineHiveInputFormat.class, newJob);
    InputSplit[] retSplits = combineInputFormat.getSplits(newJob, 1);
    assertEquals(1, retSplits.length);
  } catch (Exception e) {
    e.printStackTrace();
    fail("Caught exception " + e);
  } finally {
    if (tblCreated) {
      drv.run("drop table text_symlink_text");
    }
  }
}
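The writeSymlinkFile helper belongs to the test class and is not shown in this excerpt. The key idea behind SymlinkTextInputFormat is that the file loaded into the table is not data itself but a plain text file listing the real data paths, one per line. A rough sketch of such a helper, with an assumed signature (the test's version is called with paths only, since its JobConf is a field):
import java.io.BufferedWriter;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

// Assumed helper: writes each target path on its own line, the layout that
// SymlinkTextInputFormat resolves into the actual data files.
static void writeSymlinkFile(Configuration conf, Path symlinkPath, Path... targets) throws Exception {
  FileSystem fs = symlinkPath.getFileSystem(conf);
  try (BufferedWriter writer = new BufferedWriter(
      new OutputStreamWriter(fs.create(symlinkPath, true), StandardCharsets.UTF_8))) {
    for (Path target : targets) {
      writer.write(target.toString());
      writer.newLine();
    }
  }
}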