Use of org.apache.hadoop.hive.ql.plan.OperatorDesc in project hive by apache.
Class GenMapRedUtils, method initPlan.
/**
 * Initialize the current plan by adding it to root tasks.
 *
 * @param op
 *          the reduce sink operator encountered
 * @param opProcCtx
 *          processing context
 */
public static void initPlan(ReduceSinkOperator op, GenMRProcContext opProcCtx)
    throws SemanticException {
  Operator<? extends OperatorDesc> reducer = op.getChildOperators().get(0);
  Map<Operator<? extends OperatorDesc>, GenMapRedCtx> mapCurrCtx = opProcCtx.getMapCurrCtx();
  GenMapRedCtx mapredCtx = mapCurrCtx.get(op.getParentOperators().get(0));
  Task<? extends Serializable> currTask = mapredCtx.getCurrTask();
  MapredWork plan = (MapredWork) currTask.getWork();
  HashMap<Operator<? extends OperatorDesc>, Task<? extends Serializable>> opTaskMap =
      opProcCtx.getOpTaskMap();
  TableScanOperator currTopOp = opProcCtx.getCurrTopOp();
  opTaskMap.put(reducer, currTask);
  plan.setReduceWork(new ReduceWork());
  plan.getReduceWork().setReducer(reducer);
  ReduceSinkDesc desc = op.getConf();
  plan.getReduceWork().setNumReduceTasks(desc.getNumReducers());
  if (needsTagging(plan.getReduceWork())) {
    plan.getReduceWork().setNeedsTagging(true);
  }
  assert currTopOp != null;
  String currAliasId = opProcCtx.getCurrAliasId();
  if (!opProcCtx.isSeenOp(currTask, currTopOp)) {
    setTaskPlan(currAliasId, currTopOp, currTask, false, opProcCtx);
  }
  currTopOp = null;
  currAliasId = null;
  opProcCtx.setCurrTask(currTask);
  opProcCtx.setCurrTopOp(currTopOp);
  opProcCtx.setCurrAliasId(currAliasId);
}
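For context, a hedged sketch of how a plan-generation rule might invoke initPlan when it first visits a ReduceSinkOperator. The class name and the decision of when to call initPlan are assumptions modeled on Hive's GenMRRedSink rules, not copied from them; imports are omitted as in the rest of this listing.

// Sketch only: a graph-walker rule that starts a new map-reduce stage at a
// reduce sink. Hive's real GenMRRedSink rules carry additional logic.
public class SketchRedSinkRule implements NodeProcessor {
  @Override
  public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
      Object... nodeOutputs) throws SemanticException {
    ReduceSinkOperator op = (ReduceSinkOperator) nd;
    GenMRProcContext ctx = (GenMRProcContext) procCtx;
    // First reduce sink on this path: attach a reduce stage to the current task.
    GenMapRedUtils.initPlan(op, ctx);
    return null;
  }
}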
Use of org.apache.hadoop.hive.ql.plan.OperatorDesc in project hive by apache.
Class GenMapRedUtils, method setKeyAndValueDesc.
/**
 * Set the key and value descriptors on the reduce work.
 *
 * @param plan
 *          current plan
 * @param topOp
 *          current top operator in the path
 */
public static void setKeyAndValueDesc(ReduceWork plan, Operator<? extends OperatorDesc> topOp) {
  if (topOp == null) {
    return;
  }
  if (topOp instanceof ReduceSinkOperator) {
    ReduceSinkOperator rs = (ReduceSinkOperator) topOp;
    setKeyAndValueDesc(plan, rs);
  } else {
    List<Operator<? extends OperatorDesc>> children = topOp.getChildOperators();
    if (children != null) {
      for (Operator<? extends OperatorDesc> op : children) {
        setKeyAndValueDesc(plan, op);
      }
    }
  }
}
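A hedged usage sketch: once the map-side tree is wired up, this method is typically driven from each top operator so the recursion can reach every ReduceSinkOperator beneath it. Only getMapWork() and getAliasToWork() come from the snippets in this listing; the loop and variable names are illustrative.

// Illustrative only: copy key/value table descriptors from every reachable
// reduce sink into the reduce-side plan.
MapredWork plan = (MapredWork) currTask.getWork();
for (Operator<? extends OperatorDesc> topOp : plan.getMapWork().getAliasToWork().values()) {
  GenMapRedUtils.setKeyAndValueDesc(plan.getReduceWork(), topOp);
}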
Use of org.apache.hadoop.hive.ql.plan.OperatorDesc in project hive by apache.
Class GenMapRedUtils, method splitTasks.
/**
 * Split two tasks by creating a temporary file between them.
 *
 * @param op reduce sink operator being processed
 * @param parentTask the parent task
 * @param childTask the child task
 * @param opProcCtx context
 */
@SuppressWarnings("nls")
private static void splitTasks(ReduceSinkOperator op,
    Task<? extends Serializable> parentTask, Task<? extends Serializable> childTask,
    GenMRProcContext opProcCtx) throws SemanticException {
  if (op.getNumParent() != 1) {
    throw new IllegalStateException("Expecting operator " + op + " to have one parent. "
        + "But found multiple parents : " + op.getParentOperators());
  }
  ParseContext parseCtx = opProcCtx.getParseCtx();
  parentTask.addDependentTask(childTask);
  // A root task cannot depend on any other task, therefore childTask cannot be
  // a root task.
  List<Task<? extends Serializable>> rootTasks = opProcCtx.getRootTasks();
  if (rootTasks.contains(childTask)) {
    rootTasks.remove(childTask);
  }
  // Generate the temporary file name.
  Context baseCtx = parseCtx.getContext();
  Path taskTmpDir = baseCtx.getMRTmpPath();
  Operator<? extends OperatorDesc> parent = op.getParentOperators().get(0);
  TableDesc tt_desc = PlanUtils.getIntermediateFileTableDesc(
      PlanUtils.getFieldSchemasFromRowSchema(parent.getSchema(), "temporarycol"));
  // Create the temporary file, its corresponding FileSinkOperator, and
  // its corresponding TableScanOperator.
  TableScanOperator tableScanOp = createTemporaryFile(parent, op, taskTmpDir, tt_desc, parseCtx);
  Map<Operator<? extends OperatorDesc>, GenMapRedCtx> mapCurrCtx = opProcCtx.getMapCurrCtx();
  mapCurrCtx.put(tableScanOp, new GenMapRedCtx(childTask, null));
  String streamDesc = taskTmpDir.toUri().toString();
  MapredWork cplan = (MapredWork) childTask.getWork();
  if (needsTagging(cplan.getReduceWork())) {
    Operator<? extends OperatorDesc> reducerOp = cplan.getReduceWork().getReducer();
    String id = null;
    if (reducerOp instanceof JoinOperator) {
      if (parseCtx.getJoinOps().contains(reducerOp)) {
        id = ((JoinOperator) reducerOp).getConf().getId();
      }
    } else if (reducerOp instanceof MapJoinOperator) {
      if (parseCtx.getMapJoinOps().contains(reducerOp)) {
        id = ((MapJoinOperator) reducerOp).getConf().getId();
      }
    } else if (reducerOp instanceof SMBMapJoinOperator) {
      if (parseCtx.getSmbMapJoinOps().contains(reducerOp)) {
        id = ((SMBMapJoinOperator) reducerOp).getConf().getId();
      }
    }
    if (id != null) {
      streamDesc = id + ":$INTNAME";
    } else {
      streamDesc = "$INTNAME";
    }
    String origStreamDesc = streamDesc;
    int pos = 0;
    while (cplan.getMapWork().getAliasToWork().get(streamDesc) != null) {
      streamDesc = origStreamDesc.concat(String.valueOf(++pos));
    }
    // TODO: Allocate work to remove the temporary files and make that
    // dependent on the redTask.
    cplan.getReduceWork().setNeedsTagging(true);
  }
  // Add the path to the alias mapping.
  setTaskPlan(taskTmpDir, streamDesc, tableScanOp, cplan.getMapWork(), false, tt_desc);
  opProcCtx.setCurrTopOp(null);
  opProcCtx.setCurrAliasId(null);
  opProcCtx.setCurrTask(childTask);
  opProcCtx.addRootIfPossible(parentTask);
}
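The alias-uniquing loop above is easy to miss: if "$INTNAME" is already taken in the child plan's alias map, the code tries "$INTNAME1", "$INTNAME2", and so on. Here is a minimal, standalone Java illustration of the same idea; the helper name and the map contents are made up for the example.

import java.util.LinkedHashMap;
import java.util.Map;

public class AliasUniquing {
  // Same technique as splitTasks: append 1, 2, 3, ... until the alias is free.
  static String uniqueAlias(String base, Map<String, ?> aliasToWork) {
    String alias = base;
    int pos = 0;
    while (aliasToWork.containsKey(alias)) {
      alias = base + (++pos);
    }
    return alias;
  }

  public static void main(String[] args) {
    Map<String, Object> aliasToWork = new LinkedHashMap<>();
    aliasToWork.put("$INTNAME", new Object());
    aliasToWork.put("$INTNAME1", new Object());
    System.out.println(uniqueAlias("$INTNAME", aliasToWork)); // prints $INTNAME2
  }
}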
Use of org.apache.hadoop.hive.ql.plan.OperatorDesc in project hive by apache.
Class GenMapRedUtils, method createMergeTask.
/**
 * Create a block-level merge task for RCFile tables or a stripe-level merge
 * task for ORC tables.
 *
 * @param fsInputDesc descriptor of the file sink whose output should be merged
 * @param finalName final destination path of the merged output
 * @param hasDynamicPartitions whether the output uses dynamic partitioning
 * @param ctx compilation context
 * @return the merge work (a MergeFileWork, typed as MapWork); a
 *         SemanticException is thrown for any file format other than RCFile
 *         or ORC
 */
public static MapWork createMergeTask(FileSinkDesc fsInputDesc, Path finalName,
    boolean hasDynamicPartitions, CompilationOpContext ctx) throws SemanticException {
  Path inputDir = fsInputDesc.getFinalDirName();
  TableDesc tblDesc = fsInputDesc.getTableInfo();
  List<Path> inputDirs = new ArrayList<Path>(1);
  ArrayList<String> inputDirstr = new ArrayList<String>(1);
  // In the dynamic partitioning and list bucketing cases, no fixed input
  // directory is added.
  if (!hasDynamicPartitions && !GenMapRedUtils.isSkewedStoredAsDirs(fsInputDesc)) {
    inputDirs.add(inputDir);
  }
  inputDirstr.add(inputDir.toString());
  // Internal input format class for CombineHiveInputFormat.
  final Class<? extends InputFormat> internalIFClass;
  if (tblDesc.getInputFileFormatClass().equals(RCFileInputFormat.class)) {
    internalIFClass = RCFileBlockMergeInputFormat.class;
  } else if (tblDesc.getInputFileFormatClass().equals(OrcInputFormat.class)) {
    internalIFClass = OrcFileStripeMergeInputFormat.class;
  } else {
    throw new SemanticException("createMergeTask called on a table with file"
        + " format other than RCFile or ORCFile");
  }
  // Create the merge file work.
  MergeFileWork work = new MergeFileWork(inputDirs, finalName, hasDynamicPartitions,
      tblDesc.getInputFileFormatClass().getName());
  LinkedHashMap<Path, ArrayList<String>> pathToAliases = new LinkedHashMap<>();
  pathToAliases.put(inputDir, inputDirstr);
  work.setMapperCannotSpanPartns(true);
  work.setPathToAliases(pathToAliases);
  PartitionDesc pDesc = new PartitionDesc(tblDesc, null);
  pDesc.setInputFileFormatClass(internalIFClass);
  work.addPathToPartitionInfo(inputDir, pDesc);
  work.setListBucketingCtx(fsInputDesc.getLbCtx());
  // Create the alias-to-work map that contains the merge operator.
  LinkedHashMap<String, Operator<? extends OperatorDesc>> aliasToWork =
      new LinkedHashMap<String, Operator<? extends OperatorDesc>>();
  Operator<? extends OperatorDesc> mergeOp = null;
  final FileMergeDesc fmd;
  if (tblDesc.getInputFileFormatClass().equals(RCFileInputFormat.class)) {
    fmd = new RCFileMergeDesc();
  } else {
    fmd = new OrcFileMergeDesc();
  }
  fmd.setDpCtx(fsInputDesc.getDynPartCtx());
  fmd.setOutputPath(finalName);
  fmd.setHasDynamicPartitions(work.hasDynamicPartitions());
  fmd.setListBucketingAlterTableConcatenate(work.isListBucketingAlterTableConcatenate());
  int lbLevel = work.getListBucketingCtx() == null ? 0
      : work.getListBucketingCtx().calculateListBucketingLevel();
  fmd.setListBucketingDepth(lbLevel);
  mergeOp = OperatorFactory.get(ctx, fmd);
  aliasToWork.put(inputDir.toString(), mergeOp);
  work.setAliasToWork(aliasToWork);
  return work;
}
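A hedged usage sketch: wrapping the returned merge work in an executable task. TaskFactory.get is Hive's usual way to turn a *Work into a Task, but treat the exact overload and the surrounding variables (conf, currTask, and the fsInputDesc/finalName/ctx arguments) as assumptions for illustration.

// Sketch only: build a stripe/block merge over the file sink's output
// directory and chain it after the current task.
MapWork mergeWork = GenMapRedUtils.createMergeTask(fsInputDesc, finalName,
    false /* hasDynamicPartitions */, ctx);
Task<? extends Serializable> mergeTask = TaskFactory.get(mergeWork, conf);
currTask.addDependentTask(mergeTask);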
Use of org.apache.hadoop.hive.ql.plan.OperatorDesc in project hive by apache.
Class GenMapRedUtils, method replaceMapWork.
/**
 * Replace the map-side operator tree associated with targetAlias in
 * target with the map-side operator tree associated with sourceAlias in source.
 *
 * @param sourceAlias alias providing the replacement operator tree
 * @param targetAlias alias whose operator tree is replaced
 * @param source the MapWork supplying the new operator tree
 * @param target the MapWork being rewritten
 */
public static void replaceMapWork(String sourceAlias, String targetAlias,
    MapWork source, MapWork target) {
  Map<Path, ArrayList<String>> sourcePathToAliases = source.getPathToAliases();
  Map<Path, PartitionDesc> sourcePathToPartitionInfo = source.getPathToPartitionInfo();
  Map<String, Operator<? extends OperatorDesc>> sourceAliasToWork = source.getAliasToWork();
  Map<String, PartitionDesc> sourceAliasToPartnInfo = source.getAliasToPartnInfo();
  LinkedHashMap<Path, ArrayList<String>> targetPathToAliases = target.getPathToAliases();
  LinkedHashMap<Path, PartitionDesc> targetPathToPartitionInfo = target.getPathToPartitionInfo();
  Map<String, Operator<? extends OperatorDesc>> targetAliasToWork = target.getAliasToWork();
  Map<String, PartitionDesc> targetAliasToPartnInfo = target.getAliasToPartnInfo();
  if (!sourceAliasToWork.containsKey(sourceAlias) || !targetAliasToWork.containsKey(targetAlias)) {
    // Nothing to do if no operator tree is associated with sourceAlias in
    // source or with targetAlias in target.
    return;
  }
  if (sourceAliasToWork.size() > 1) {
    // If source contains more than one alias, we do not know
    // how to merge.
    return;
  }
  // Remove unnecessary information from target.
  targetAliasToWork.remove(targetAlias);
  targetAliasToPartnInfo.remove(targetAlias);
  List<Path> pathsToRemove = new ArrayList<>();
  for (Entry<Path, ArrayList<String>> entry : targetPathToAliases.entrySet()) {
    ArrayList<String> aliases = entry.getValue();
    aliases.remove(targetAlias);
    if (aliases.isEmpty()) {
      pathsToRemove.add(entry.getKey());
    }
  }
  for (Path pathToRemove : pathsToRemove) {
    targetPathToAliases.remove(pathToRemove);
    targetPathToPartitionInfo.remove(pathToRemove);
  }
  // Add new information from source to target.
  targetAliasToWork.put(sourceAlias, sourceAliasToWork.get(sourceAlias));
  targetAliasToPartnInfo.putAll(sourceAliasToPartnInfo);
  targetPathToPartitionInfo.putAll(sourcePathToPartitionInfo);
  List<Path> pathsToAdd = new ArrayList<>();
  for (Entry<Path, ArrayList<String>> entry : sourcePathToAliases.entrySet()) {
    ArrayList<String> aliases = entry.getValue();
    if (aliases.contains(sourceAlias)) {
      pathsToAdd.add(entry.getKey());
    }
  }
  for (Path pathToAdd : pathsToAdd) {
    if (!targetPathToAliases.containsKey(pathToAdd)) {
      targetPathToAliases.put(pathToAdd, new ArrayList<String>());
    }
    targetPathToAliases.get(pathToAdd).add(sourceAlias);
  }
  target.setPathToAliases(targetPathToAliases);
  target.setPathToPartitionInfo(targetPathToPartitionInfo);
}
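The pruning step in the middle of replaceMapWork (drop the target alias from every path's alias list, then remove paths left with no aliases) is the subtle part. Below is a standalone, runnable Java illustration of the same bookkeeping; the paths and alias names are made up, and entrySet().removeIf stands in for the collect-then-remove loop in the original.

import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

public class AliasPruning {
  public static void main(String[] args) {
    Map<String, List<String>> pathToAliases = new LinkedHashMap<>();
    pathToAliases.put("/warehouse/t1", new ArrayList<>(List.of("a", "b")));
    pathToAliases.put("/warehouse/t2", new ArrayList<>(List.of("b")));

    String targetAlias = "b";
    // Drop the target alias everywhere, then prune paths with no aliases left.
    pathToAliases.values().forEach(aliases -> aliases.remove(targetAlias));
    pathToAliases.entrySet().removeIf(e -> e.getValue().isEmpty());

    System.out.println(pathToAliases); // prints {/warehouse/t1=[a]}
  }
}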