use of org.apache.hadoop.hive.ql.plan.OperatorDesc in project hive by apache.
the class CombineHiveInputFormat method getCombineSplits.
/**
* Create Hive splits based on CombineFileSplit.
*/
private InputSplit[] getCombineSplits(JobConf job, int numSplits, Map<Path, PartitionDesc> pathToPartitionInfo) throws IOException {
init(job);
Map<Path, ArrayList<String>> pathToAliases = mrwork.getPathToAliases();
Map<String, Operator<? extends OperatorDesc>> aliasToWork = mrwork.getAliasToWork();
CombineFileInputFormatShim combine = ShimLoader.getHadoopShims().getCombineFileInputFormat();
InputSplit[] splits = null;
if (combine == null) {
splits = super.getSplits(job, numSplits);
return splits;
}
if (combine.getInputPathsShim(job).length == 0) {
throw new IOException("No input paths specified in job");
}
ArrayList<InputSplit> result = new ArrayList<InputSplit>();
// combine splits only from same tables and same partitions. Do not combine splits from multiple
// tables or multiple partitions.
Path[] paths = StringInternUtils.internUriStringsInPathArray(combine.getInputPathsShim(job));
List<Path> inpDirs = new ArrayList<Path>();
List<Path> inpFiles = new ArrayList<Path>();
Map<CombinePathInputFormat, CombineFilter> poolMap = new HashMap<CombinePathInputFormat, CombineFilter>();
Set<Path> poolSet = new HashSet<Path>();
LockedDriverState lDrvStat = LockedDriverState.getLockedDriverState();
for (Path path : paths) {
if (lDrvStat != null && lDrvStat.isAborted()) {
throw new IOException("Operation is Canceled. ");
}
PartitionDesc part = HiveFileFormatUtils.getFromPathRecursively(pathToPartitionInfo, path, IOPrepareCache.get().allocatePartitionDescMap());
TableDesc tableDesc = part.getTableDesc();
if ((tableDesc != null) && tableDesc.isNonNative()) {
return super.getSplits(job, numSplits);
}
// Use HiveInputFormat if any of the paths is not splittable
Class inputFormatClass = part.getInputFileFormatClass();
String inputFormatClassName = inputFormatClass.getName();
InputFormat inputFormat = getInputFormatFromCache(inputFormatClass, job);
String deserializerClassName = null;
try {
deserializerClassName = part.getDeserializer(job).getClass().getName();
} catch (Exception e) {
// ignore
}
FileSystem inpFs = path.getFileSystem(job);
// don't combine if inputformat is a SymlinkTextInputFormat
if (inputFormat instanceof SymlinkTextInputFormat) {
splits = super.getSplits(job, numSplits);
return splits;
}
Path filterPath = path;
// Does a pool exist for this path already
CombineFilter f = null;
List<Operator<? extends OperatorDesc>> opList = null;
if (!mrwork.isMapperCannotSpanPartns()) {
// if mapper can span partitions, make sure a splits does not contain multiple
// opList + inputFormatClassName + deserializerClassName combination
// This is done using the Map of CombinePathInputFormat to PathFilter
opList = HiveFileFormatUtils.doGetWorksFromPath(pathToAliases, aliasToWork, filterPath);
CombinePathInputFormat combinePathInputFormat = new CombinePathInputFormat(opList, inputFormatClassName, deserializerClassName);
f = poolMap.get(combinePathInputFormat);
if (f == null) {
f = new CombineFilter(filterPath);
LOG.info("CombineHiveInputSplit creating pool for " + path + "; using filter path " + filterPath);
combine.createPool(job, f);
poolMap.put(combinePathInputFormat, f);
} else {
LOG.debug("CombineHiveInputSplit: pool is already created for " + path + "; using filter path " + filterPath);
f.addPath(filterPath);
}
} else {
// but won't cross multiple partitions if the user has asked so.
if (!path.getFileSystem(job).getFileStatus(path).isDir()) {
// path is not directory
filterPath = path.getParent();
inpFiles.add(path);
poolSet.add(filterPath);
} else {
inpDirs.add(path);
}
}
}
// Processing directories
List<CombineFileSplit> iss = new ArrayList<CombineFileSplit>();
if (!mrwork.isMapperCannotSpanPartns()) {
// mapper can span partitions
// combine into as few as one split, subject to the PathFilters set
// using combine.createPool.
iss = Arrays.asList(combine.getSplits(job, 1));
} else {
for (Path path : inpDirs) {
processPaths(job, combine, iss, path);
}
if (inpFiles.size() > 0) {
// Processing files
for (Path filterPath : poolSet) {
combine.createPool(job, new CombineFilter(filterPath));
}
processPaths(job, combine, iss, inpFiles.toArray(new Path[0]));
}
}
if (mrwork.getNameToSplitSample() != null && !mrwork.getNameToSplitSample().isEmpty()) {
iss = sampleSplits(iss);
}
for (CombineFileSplit is : iss) {
CombineHiveInputSplit csplit = new CombineHiveInputSplit(job, is, pathToPartitionInfo);
result.add(csplit);
}
LOG.info("number of splits " + result.size());
return result.toArray(new InputSplit[result.size()]);
}
use of org.apache.hadoop.hive.ql.plan.OperatorDesc in project hive by apache.
the class AbstractSMBJoinProc method convertBucketMapJoinToSMBJoin.
// Convert the bucket map-join operator to a sort-merge map join operator
protected SMBMapJoinOperator convertBucketMapJoinToSMBJoin(MapJoinOperator mapJoinOp, SortBucketJoinProcCtx smbJoinContext) {
String[] srcs = smbJoinContext.getSrcs();
SMBMapJoinOperator smbJop = new SMBMapJoinOperator(mapJoinOp);
SMBJoinDesc smbJoinDesc = new SMBJoinDesc(mapJoinOp.getConf());
smbJop.setConf(smbJoinDesc);
HashMap<Byte, String> tagToAlias = new HashMap<Byte, String>();
for (int i = 0; i < srcs.length; i++) {
tagToAlias.put((byte) i, srcs[i]);
}
smbJoinDesc.setTagToAlias(tagToAlias);
int indexInListMapJoinNoReducer = this.pGraphContext.getListMapJoinOpsNoReducer().indexOf(mapJoinOp);
if (indexInListMapJoinNoReducer >= 0) {
this.pGraphContext.getListMapJoinOpsNoReducer().remove(indexInListMapJoinNoReducer);
this.pGraphContext.getListMapJoinOpsNoReducer().add(indexInListMapJoinNoReducer, smbJop);
}
Map<String, DummyStoreOperator> aliasToSink = new HashMap<String, DummyStoreOperator>();
// For all parents (other than the big table), insert a dummy store operator
/* Consider a query like:
*
* select * from
* (subq1 --> has a filter)
* join
* (subq2 --> has a filter)
* on some key
*
* Let us assume that subq1 is the small table (either specified by the user or inferred
* automatically). The following operator tree will be created:
*
* TableScan (subq1) --> Select --> Filter --> DummyStore
* \
* \ SMBJoin
* /
* /
* TableScan (subq2) --> Select --> Filter
*/
List<Operator<? extends OperatorDesc>> parentOperators = mapJoinOp.getParentOperators();
for (int i = 0; i < parentOperators.size(); i++) {
Operator<? extends OperatorDesc> par = parentOperators.get(i);
int index = par.getChildOperators().indexOf(mapJoinOp);
par.getChildOperators().remove(index);
if (i == smbJoinDesc.getPosBigTable()) {
par.getChildOperators().add(index, smbJop);
} else {
DummyStoreOperator dummyStoreOp = new DummyStoreOperator(par.getCompilationOpContext());
par.getChildOperators().add(index, dummyStoreOp);
List<Operator<? extends OperatorDesc>> childrenOps = new ArrayList<Operator<? extends OperatorDesc>>();
childrenOps.add(smbJop);
dummyStoreOp.setChildOperators(childrenOps);
List<Operator<? extends OperatorDesc>> parentOps = new ArrayList<Operator<? extends OperatorDesc>>();
parentOps.add(par);
dummyStoreOp.setParentOperators(parentOps);
aliasToSink.put(srcs[i], dummyStoreOp);
smbJop.getParentOperators().remove(i);
smbJop.getParentOperators().add(i, dummyStoreOp);
}
}
smbJoinDesc.setAliasToSink(aliasToSink);
List<Operator<? extends OperatorDesc>> childOps = mapJoinOp.getChildOperators();
for (int i = 0; i < childOps.size(); i++) {
Operator<? extends OperatorDesc> child = childOps.get(i);
int index = child.getParentOperators().indexOf(mapJoinOp);
child.getParentOperators().remove(index);
child.getParentOperators().add(index, smbJop);
}
// Data structures coming from QBJoinTree
smbJop.getConf().setQBJoinTreeProps(mapJoinOp.getConf());
//
pGraphContext.getSmbMapJoinOps().add(smbJop);
pGraphContext.getMapJoinOps().remove(mapJoinOp);
return smbJop;
}
use of org.apache.hadoop.hive.ql.plan.OperatorDesc in project hive by apache.
the class GenMRFileSink1 method processFS.
/**
* Process the FileSink operator to generate a MoveTask if necessary.
*
* @param fsOp
* current FileSink operator
* @param stack
* parent operators
* @param opProcCtx
* @param chDir
* whether the operator should be first output to a tmp dir and then merged
* to the final dir later
* @return the final file name to which the FileSinkOperator should store.
* @throws SemanticException
*/
private Path processFS(FileSinkOperator fsOp, Stack<Node> stack, NodeProcessorCtx opProcCtx, boolean chDir) throws SemanticException {
GenMRProcContext ctx = (GenMRProcContext) opProcCtx;
Task<? extends Serializable> currTask = ctx.getCurrTask();
// If the directory needs to be changed, send the new directory
Path dest = null;
List<FileSinkOperator> seenFSOps = ctx.getSeenFileSinkOps();
if (seenFSOps == null) {
seenFSOps = new ArrayList<FileSinkOperator>();
}
if (!seenFSOps.contains(fsOp)) {
seenFSOps.add(fsOp);
}
ctx.setSeenFileSinkOps(seenFSOps);
dest = GenMapRedUtils.createMoveTask(ctx.getCurrTask(), chDir, fsOp, ctx.getParseCtx(), ctx.getMvTask(), ctx.getConf(), ctx.getDependencyTaskForMultiInsert());
TableScanOperator currTopOp = ctx.getCurrTopOp();
String currAliasId = ctx.getCurrAliasId();
HashMap<Operator<? extends OperatorDesc>, Task<? extends Serializable>> opTaskMap = ctx.getOpTaskMap();
// If it is a map-only job, the task needs to be processed
if (currTopOp != null) {
Task<? extends Serializable> mapTask = opTaskMap.get(null);
if (mapTask == null) {
if (!ctx.isSeenOp(currTask, currTopOp)) {
GenMapRedUtils.setTaskPlan(currAliasId, currTopOp, currTask, false, ctx);
}
opTaskMap.put(null, currTask);
} else {
if (!ctx.isSeenOp(currTask, currTopOp)) {
GenMapRedUtils.setTaskPlan(currAliasId, currTopOp, mapTask, false, ctx);
} else {
UnionOperator currUnionOp = ctx.getCurrUnionOp();
if (currUnionOp != null) {
opTaskMap.put(null, currTask);
ctx.setCurrTopOp(null);
GenMapRedUtils.initUnionPlan(ctx, currUnionOp, currTask, false);
return dest;
}
}
// mapTask and currTask should be merged by and join/union operator
// (e.g., GenMRUnion1) which has multiple topOps.
// assert mapTask == currTask : "mapTask.id = " + mapTask.getId()
// + "; currTask.id = " + currTask.getId();
}
return dest;
}
UnionOperator currUnionOp = ctx.getCurrUnionOp();
if (currUnionOp != null) {
opTaskMap.put(null, currTask);
GenMapRedUtils.initUnionPlan(ctx, currUnionOp, currTask, false);
return dest;
}
return dest;
}
use of org.apache.hadoop.hive.ql.plan.OperatorDesc in project hive by apache.
the class GenMRFileSink1 method process.
/**
* File Sink Operator encountered.
*
* @param nd
* the file sink operator encountered
* @param opProcCtx
* context
*/
@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx opProcCtx, Object... nodeOutputs) throws SemanticException {
GenMRProcContext ctx = (GenMRProcContext) opProcCtx;
ParseContext parseCtx = ctx.getParseCtx();
boolean chDir = false;
// we should look take the parent of fsOp's task as the current task.
FileSinkOperator fsOp = (FileSinkOperator) nd;
Map<Operator<? extends OperatorDesc>, GenMapRedCtx> mapCurrCtx = ctx.getMapCurrCtx();
GenMapRedCtx mapredCtx = mapCurrCtx.get(fsOp.getParentOperators().get(0));
Task<? extends Serializable> currTask = mapredCtx.getCurrTask();
ctx.setCurrTask(currTask);
ctx.addRootIfPossible(currTask);
// is INSERT OVERWRITE TABLE
boolean isInsertTable = GenMapRedUtils.isInsertInto(parseCtx, fsOp);
HiveConf hconf = parseCtx.getConf();
// Mark this task as a final map reduce task (ignoring the optional merge task)
((MapredWork) currTask.getWork()).setFinalMapRed(true);
// If this file sink desc has been processed due to a linked file sink desc,
// use that task
Map<FileSinkDesc, Task<? extends Serializable>> fileSinkDescs = ctx.getLinkedFileDescTasks();
if (fileSinkDescs != null) {
Task<? extends Serializable> childTask = fileSinkDescs.get(fsOp.getConf());
processLinkedFileDesc(ctx, childTask);
return true;
}
// So, no need to attempt to merge the files again.
if ((ctx.getSeenFileSinkOps() == null) || (!ctx.getSeenFileSinkOps().contains(nd))) {
chDir = GenMapRedUtils.isMergeRequired(ctx.getMvTask(), hconf, fsOp, currTask, isInsertTable);
}
Path finalName = processFS(fsOp, stack, opProcCtx, chDir);
if (chDir) {
// Merge the files in the destination table/partitions by creating Map-only merge job
// If underlying data is RCFile or OrcFile, RCFileBlockMerge task or
// OrcFileStripeMerge task would be created.
LOG.info("using CombineHiveInputformat for the merge job");
GenMapRedUtils.createMRWorkForMergingFiles(fsOp, finalName, ctx.getDependencyTaskForMultiInsert(), ctx.getMvTask(), hconf, currTask, parseCtx.getQueryState().getLineageState());
}
FileSinkDesc fileSinkDesc = fsOp.getConf();
if (fileSinkDesc.isLinkedFileSink()) {
Map<FileSinkDesc, Task<? extends Serializable>> linkedFileDescTasks = ctx.getLinkedFileDescTasks();
if (linkedFileDescTasks == null) {
linkedFileDescTasks = new HashMap<FileSinkDesc, Task<? extends Serializable>>();
ctx.setLinkedFileDescTasks(linkedFileDescTasks);
}
// The child tasks may be null in case of a select
if ((currTask.getChildTasks() != null) && (currTask.getChildTasks().size() == 1)) {
for (FileSinkDesc fileDesc : fileSinkDesc.getLinkedFileSinkDesc()) {
linkedFileDescTasks.put(fileDesc, currTask.getChildTasks().get(0));
}
}
}
FetchTask fetchTask = parseCtx.getFetchTask();
if (fetchTask != null && currTask.getNumChild() == 0) {
if (fetchTask.isFetchFrom(fileSinkDesc)) {
currTask.setFetchSource(true);
}
}
return true;
}
use of org.apache.hadoop.hive.ql.plan.OperatorDesc in project hive by apache.
the class GenMRRedSink1 method process.
/**
* Reduce Sink encountered.
* a) If we are seeing this RS for first time, we initialize plan corresponding to this RS.
* b) If we are seeing this RS for second or later time then either query had a join in which
* case we will merge this plan with earlier plan involving this RS or plan for this RS
* needs to be split in two branches.
*
* @param nd
* the reduce sink operator encountered
* @param opProcCtx
* context
*/
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx opProcCtx, Object... nodeOutputs) throws SemanticException {
ReduceSinkOperator op = (ReduceSinkOperator) nd;
GenMRProcContext ctx = (GenMRProcContext) opProcCtx;
Map<Operator<? extends OperatorDesc>, GenMapRedCtx> mapCurrCtx = ctx.getMapCurrCtx();
GenMapRedCtx mapredCtx = mapCurrCtx.get(stack.get(stack.size() - 2));
Task<? extends Serializable> currTask = mapredCtx.getCurrTask();
MapredWork currPlan = (MapredWork) currTask.getWork();
String currAliasId = mapredCtx.getCurrAliasId();
if (op.getNumChild() != 1) {
throw new IllegalStateException("Expecting operator " + op + " to have one child. " + "But found multiple children : " + op.getChildOperators());
}
Operator<? extends OperatorDesc> reducer = op.getChildOperators().get(0);
Task<? extends Serializable> oldTask = ctx.getOpTaskMap().get(reducer);
ctx.setCurrAliasId(currAliasId);
ctx.setCurrTask(currTask);
// If the plan for this reducer does not exist, initialize the plan
if (oldTask == null) {
if (currPlan.getReduceWork() == null) {
GenMapRedUtils.initPlan(op, ctx);
} else {
GenMapRedUtils.splitPlan(op, ctx);
}
} else {
// This will happen in case of joins. The current plan can be thrown away
// after being merged with the original plan
GenMapRedUtils.joinPlan(currTask, oldTask, ctx);
currTask = oldTask;
ctx.setCurrTask(currTask);
}
mapCurrCtx.put(op, new GenMapRedCtx(ctx.getCurrTask(), ctx.getCurrAliasId()));
if (GenMapRedUtils.hasBranchFinished(nodeOutputs)) {
ctx.addRootIfPossible(currTask);
return false;
}
return true;
}
Aggregations