use of org.apache.hadoop.hive.ql.exec.TableScanOperator in project hive by apache.
the class GenMRUnion1 method processSubQueryUnionCreateIntermediate.
/**
 * Process the union when the parent is a map-reduce job. Create a temporary
 * output, and let the union task read from the temporary output.
 *
 * The files created for all the inputs are in the union context and later
 * used to initialize the union plan.
 *
 * @param parent
 * @param child
 * @param uTask
 * @param ctx
 * @param uCtxTask
 */
private void processSubQueryUnionCreateIntermediate(Operator<? extends OperatorDesc> parent,
    Operator<? extends OperatorDesc> child, Task<?> uTask, GenMRProcContext ctx,
    GenMRUnionCtx uCtxTask) {
  ParseContext parseCtx = ctx.getParseCtx();
  TableDesc tt_desc = PlanUtils.getIntermediateFileTableDesc(
      PlanUtils.getFieldSchemasFromRowSchema(parent.getSchema(), "temporarycol"));
  // generate the temporary file
  Context baseCtx = parseCtx.getContext();
  Path taskTmpDir = baseCtx.getMRTmpPath();
  // Create the temporary file, its corresponding FileSinkOperator, and
  // its corresponding TableScanOperator.
  TableScanOperator tableScanOp =
      GenMapRedUtils.createTemporaryFile(parent, child, taskTmpDir, tt_desc, parseCtx);
  // Add the path to the alias mapping
  uCtxTask.addTaskTmpDir(taskTmpDir.toUri().toString());
  uCtxTask.addTTDesc(tt_desc);
  uCtxTask.addListTopOperators(tableScanOp);
  // The union task is empty. The files created for all the inputs are
  // assembled in the union context and later used to initialize the union
  // plan
  Task<?> currTask = ctx.getCurrTask();
  currTask.addDependentTask(uTask);
  if (ctx.getRootTasks().contains(uTask)) {
    ctx.getRootTasks().remove(uTask);
    if (!ctx.getRootTasks().contains(currTask) && shouldBeRootTask(currTask)) {
      ctx.getRootTasks().add(currTask);
    }
  }
}
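
For orientation, here is a minimal, self-contained sketch of the rewiring that GenMapRedUtils.createTemporaryFile performs for each union input: the direct parent-to-child edge is replaced by parent -> FileSink(tmpDir), and a new TableScan(tmpDir) becomes the child's parent, while the union context only records the tmp dir, the table descriptor, and the new TableScan. The PlanNode type and the splice helper below are hypothetical stand-ins, not Hive classes.

import java.util.ArrayList;
import java.util.List;

// Hypothetical, simplified plan nodes; not Hive operator classes.
class PlanNode {
  final String name;
  final List<PlanNode> children = new ArrayList<>();
  final List<PlanNode> parents = new ArrayList<>();
  PlanNode(String name) { this.name = name; }
}

public class UnionIntermediateSketch {
  // Replace the direct parent -> child edge with parent -> FileSink(tmpDir)
  // and TableScan(tmpDir) -> child, returning the new TableScan.
  static PlanNode spliceTemporaryFile(PlanNode parent, PlanNode child, String tmpDir) {
    PlanNode fileSink = new PlanNode("FS[" + tmpDir + "]");
    PlanNode tableScan = new PlanNode("TS[" + tmpDir + "]");
    parent.children.remove(child);
    child.parents.remove(parent);
    parent.children.add(fileSink);
    fileSink.parents.add(parent);
    tableScan.children.add(child);
    child.parents.add(tableScan);
    return tableScan;
  }

  public static void main(String[] args) {
    PlanNode parent = new PlanNode("SEL");
    PlanNode child = new PlanNode("UNION");
    parent.children.add(child);
    child.parents.add(parent);
    PlanNode ts = spliceTemporaryFile(parent, child, "/tmp/hive/_task_tmp.0");
    // The union context would record tmpDir, the table descriptor, and ts,
    // so the (initially empty) union task can later be initialized from them.
    System.out.println("parent now feeds: " + parent.children.get(0).name);
    System.out.println("union now reads from: " + child.parents.get(0).name);
    System.out.println("new top operator: " + ts.name);
  }
}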
use of org.apache.hadoop.hive.ql.exec.TableScanOperator in project hive by apache.
the class GlobalLimitOptimizer method transform.
@Override
public ParseContext transform(ParseContext pctx) throws SemanticException {
  Context ctx = pctx.getContext();
  Map<String, TableScanOperator> topOps = pctx.getTopOps();
  GlobalLimitCtx globalLimitCtx = pctx.getGlobalLimitCtx();
  Map<String, SplitSample> nameToSplitSample = pctx.getNameToSplitSample();
  // The query only qualifies when there is exactly one top operator
  // and no transform/UDTF and no block sampling is used.
  if (topOps.size() == 1 && !globalLimitCtx.ifHasTransformOrUDTF()
      && nameToSplitSample.isEmpty()) {
    // Here we recursively check:
    // 1. whether there is exactly one LIMIT in the query
    // 2. whether there is no aggregation, group-by, distinct, sort by,
    //    distribute by, or table sampling in any of the sub-queries.
    // The query only qualifies if both conditions are satisfied.
    //
    // Example qualified queries:
    //   CREATE TABLE ... AS SELECT col1, col2 FROM tbl LIMIT ..
    //   INSERT OVERWRITE TABLE ... SELECT col1, hash(col2), split(col1)
    //     FROM ... LIMIT ...
    //   SELECT * FROM (SELECT col1 as col2 FROM (SELECT * FROM ...) t1 LIMIT ...) t2;
    //
    TableScanOperator ts = topOps.values().iterator().next();
    Table tab = ts.getConf().getTableMetadata();
    if (tab.isNonNative()) {
      LOG.info("Not enabling limit optimization on non native table: " + tab.getTableName());
      return pctx;
    }
    // InputFormat.getSplits won't be called if there is no input path,
    // and the TS vertex will have 0 task parallelism.
    if (tab.getStorageHandler() == null) {
      LimitOperator tempGlobalLimit = checkQbpForGlobalLimit(ts);
      // the query qualifies for the optimization
      if (tempGlobalLimit != null) {
        LimitDesc tempGlobalLimitDesc = tempGlobalLimit.getConf();
        Set<FilterOperator> filterOps = OperatorUtils.findOperators(ts, FilterOperator.class);
        if (!tab.isPartitioned()) {
          if (filterOps.size() == 0) {
            Integer tempOffset = tempGlobalLimitDesc.getOffset();
            globalLimitCtx.enableOpt(tempGlobalLimitDesc.getLimit(),
                (tempOffset == null) ? 0 : tempOffset);
          }
        } else {
          // check if the pruner only contains partition columns
          if (onlyContainsPartnCols(tab, filterOps)) {
            String alias = (String) topOps.keySet().toArray()[0];
            PrunedPartitionList partsList = pctx.getPrunedPartitions(alias, ts);
            // Only enable the optimization if there are no unknown partitions,
            // i.e. the filter can prune correctly at compile time.
            if (!partsList.hasUnknownPartitions()) {
              Integer tempOffset = tempGlobalLimitDesc.getOffset();
              globalLimitCtx.enableOpt(tempGlobalLimitDesc.getLimit(),
                  (tempOffset == null) ? 0 : tempOffset);
            }
          }
        }
        if (globalLimitCtx.isEnable()) {
          LOG.info("Qualify the optimize that reduces input size for 'offset' for offset "
              + globalLimitCtx.getGlobalOffset());
          LOG.info("Qualify the optimize that reduces input size for 'limit' for limit "
              + globalLimitCtx.getGlobalLimit());
        }
      }
    }
  }
  return pctx;
}
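
The qualification rule can be summarized with a toy walk over a simplified operator tree. The OpNode type and the operator-kind strings below are hypothetical and only approximate what checkQbpForGlobalLimit does: the plan qualifies when exactly one LIMIT is reachable from the single TableScan and no aggregation, group-by, distinct, sort-by, distribute-by, or sampling operator appears along the way.

import java.util.Arrays;
import java.util.List;

// Hypothetical operator node used only for this illustration.
class OpNode {
  final String kind;                 // e.g. "TS", "SEL", "LIMIT", "GBY", "FS"
  final List<OpNode> children;
  OpNode(String kind, OpNode... children) {
    this.kind = kind;
    this.children = Arrays.asList(children);
  }
}

public class GlobalLimitCheckSketch {
  private static final List<String> DISQUALIFYING =
      Arrays.asList("GBY", "DISTINCT", "SORTBY", "DISTRIBUTEBY", "SAMPLE");

  // Returns the number of LIMITs found, or -1 if a disqualifying operator is seen.
  static int countLimits(OpNode op) {
    if (DISQUALIFYING.contains(op.kind)) {
      return -1;
    }
    int total = op.kind.equals("LIMIT") ? 1 : 0;
    for (OpNode child : op.children) {
      int below = countLimits(child);
      if (below < 0) {
        return -1;
      }
      total += below;
    }
    return total;
  }

  static boolean qualifiesForGlobalLimit(OpNode tableScan) {
    return countLimits(tableScan) == 1;   // exactly one LIMIT, nothing disqualifying
  }

  public static void main(String[] args) {
    OpNode good = new OpNode("TS", new OpNode("SEL", new OpNode("LIMIT", new OpNode("FS"))));
    OpNode bad  = new OpNode("TS", new OpNode("GBY", new OpNode("LIMIT", new OpNode("FS"))));
    System.out.println(qualifiesForGlobalLimit(good));  // true
    System.out.println(qualifiesForGlobalLimit(bad));   // false
  }
}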
use of org.apache.hadoop.hive.ql.exec.TableScanOperator in project hive by apache.
the class GenMRFileSink1 method processFS.
/**
 * Process the FileSink operator to generate a MoveTask if necessary.
 *
 * @param fsOp
 *          current FileSink operator
 * @param stack
 *          parent operators
 * @param opProcCtx
 * @param chDir
 *          whether the operator should first output to a tmp dir and then be
 *          merged to the final dir later
 * @return the final path to which the FileSinkOperator should write
 * @throws SemanticException
 */
private Path processFS(FileSinkOperator fsOp, Stack<Node> stack,
    NodeProcessorCtx opProcCtx, boolean chDir) throws SemanticException {
  GenMRProcContext ctx = (GenMRProcContext) opProcCtx;
  Task<?> currTask = ctx.getCurrTask();
  // If the directory needs to be changed, send the new directory
  Path dest = null;
  List<FileSinkOperator> seenFSOps = ctx.getSeenFileSinkOps();
  if (seenFSOps == null) {
    seenFSOps = new ArrayList<FileSinkOperator>();
  }
  if (!seenFSOps.contains(fsOp)) {
    seenFSOps.add(fsOp);
  }
  ctx.setSeenFileSinkOps(seenFSOps);
  dest = GenMapRedUtils.createMoveTask(ctx.getCurrTask(), chDir, fsOp, ctx.getParseCtx(),
      ctx.getMvTask(), ctx.getConf(), ctx.getDependencyTaskForMultiInsert());
  TableScanOperator currTopOp = ctx.getCurrTopOp();
  String currAliasId = ctx.getCurrAliasId();
  HashMap<Operator<? extends OperatorDesc>, Task<?>> opTaskMap = ctx.getOpTaskMap();
  // If it is a map-only job, the task needs to be processed
  if (currTopOp != null) {
    Task<?> mapTask = opTaskMap.get(null);
    if (mapTask == null) {
      if (!ctx.isSeenOp(currTask, currTopOp)) {
        GenMapRedUtils.setTaskPlan(currAliasId, currTopOp, currTask, false, ctx);
      }
      opTaskMap.put(null, currTask);
    } else {
      if (!ctx.isSeenOp(currTask, currTopOp)) {
        GenMapRedUtils.setTaskPlan(currAliasId, currTopOp, mapTask, false, ctx);
      } else {
        UnionOperator currUnionOp = ctx.getCurrUnionOp();
        if (currUnionOp != null) {
          opTaskMap.put(null, currTask);
          ctx.setCurrTopOp(null);
          GenMapRedUtils.initUnionPlan(ctx, currUnionOp, currTask, false);
          return dest;
        }
      }
      // mapTask and currTask should be merged by a join/union operator
      // (e.g., GenMRUnion1) which has multiple topOps.
      // assert mapTask == currTask : "mapTask.id = " + mapTask.getId()
      // + "; currTask.id = " + currTask.getId();
    }
    return dest;
  }
  UnionOperator currUnionOp = ctx.getCurrUnionOp();
  if (currUnionOp != null) {
    opTaskMap.put(null, currTask);
    GenMapRedUtils.initUnionPlan(ctx, currUnionOp, currTask, false);
    return dest;
  }
  return dest;
}
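
The task bookkeeping in processFS follows a simple pattern that the sketch below illustrates with hypothetical stand-in types (not the real GenMRProcContext API): a "seen" set keeps the per-FileSink work idempotent, and a map keyed by reducer (null meaning map-only work) decides whether the top operator's plan is attached to the current task or to the map task created earlier. The union and multi-branch cases of the real method are omitted from this simplification.

import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

// Hypothetical stand-ins for tasks and operators; not Hive classes.
public class FileSinkWiringSketch {
  static class Task { final String id; Task(String id) { this.id = id; } }

  private final Set<String> seenFileSinks = new HashSet<>();
  // Keyed by reducer id; the null key means "map-only work".
  private final Map<String, Task> opTaskMap = new HashMap<>();
  private final Map<Task, Set<String>> plansAttached = new HashMap<>();

  void processFileSink(String fileSinkId, String topOpAlias, Task currTask) {
    seenFileSinks.add(fileSinkId);                       // idempotent bookkeeping
    Task mapTask = opTaskMap.get(null);
    if (mapTask == null) {
      attachPlan(currTask, topOpAlias);                  // first map-only sink: use the current task
      opTaskMap.put(null, currTask);
    } else {
      attachPlan(mapTask, topOpAlias);                   // later sinks reuse the existing map task
    }
  }

  private void attachPlan(Task task, String alias) {
    // attach only once per (task, alias), mirroring the ctx.isSeenOp(...) guard
    plansAttached.computeIfAbsent(task, t -> new HashSet<>()).add(alias);
  }

  public static void main(String[] args) {
    FileSinkWiringSketch sketch = new FileSinkWiringSketch();
    Task t1 = new Task("Stage-1");
    sketch.processFileSink("FS_7", "src", t1);
    sketch.processFileSink("FS_9", "src", t1);           // reuses the map task registered above
    System.out.println(sketch.opTaskMap.get(null).id);   // Stage-1
  }
}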
use of org.apache.hadoop.hive.ql.exec.TableScanOperator in project hive by apache.
the class ConvertJoinMapJoin method checkConvertJoinSMBJoin.
/*
 * This method tries to convert a join to an SMB join. This is done based on
 * traits. If the sort columns are the same as the join columns, then we can
 * convert the join to an SMB join. Otherwise we retain the bucket map join,
 * as it is still more efficient than a regular join.
 */
private boolean checkConvertJoinSMBJoin(JoinOperator joinOp, OptimizeTezProcContext context,
    int bigTablePosition, TezBucketJoinProcCtx tezBucketJoinProcCtx) throws SemanticException {
  ReduceSinkOperator bigTableRS =
      (ReduceSinkOperator) joinOp.getParentOperators().get(bigTablePosition);
  int numBuckets = bigTableRS.getParentOperators().get(0).getOpTraits().getNumBuckets();
  int size = -1;
  boolean shouldCheckExternalTables =
      context.conf.getBoolVar(HiveConf.ConfVars.HIVE_DISABLE_UNSAFE_EXTERNALTABLE_OPERATIONS);
  StringBuilder sb = new StringBuilder();
  for (Operator<?> parentOp : joinOp.getParentOperators()) {
    if (shouldCheckExternalTables && hasExternalTableAncestor(parentOp, sb)) {
      LOG.debug("External table {} found in join - disabling SMB join.", sb.toString());
      return false;
    }
    // The sides must be balanced: either every side has upstream ReduceSinks or none does.
    // If one side is unbalanced, we cannot convert.
    // This is a workaround for now. The right fix would be to refactor the code in
    // MapRecordProcessor and ReduceRecordProcessor with respect to the sources.
    Set<ReduceSinkOperator> set =
        OperatorUtils.findOperatorsUpstream(parentOp.getParentOperators(), ReduceSinkOperator.class);
    if (size < 0) {
      size = set.size();
      continue;
    }
    if (((size > 0) && (set.size() > 0)) || ((size == 0) && (set.size() == 0))) {
      continue;
    } else {
      return false;
    }
  }
  // transformation of the join operation
  for (Operator<? extends OperatorDesc> parentOp : joinOp.getParentOperators()) {
    if (!(parentOp instanceof ReduceSinkOperator)) {
      // could be mux/demux operators. Currently not supported
      LOG.debug("Found correlation optimizer operators. Cannot convert to SMB at this time.");
      return false;
    }
    ReduceSinkOperator rsOp = (ReduceSinkOperator) parentOp;
    List<ExprNodeDesc> keyCols = rsOp.getConf().getKeyCols();
    // For SMB, the key column(s) in the RS should be the same as the bucket column(s) and sort column(s)
    List<String> sortCols = rsOp.getOpTraits().getSortCols().get(0);
    List<String> bucketCols = rsOp.getOpTraits().getBucketColNames().get(0);
    if (sortCols.size() != keyCols.size() || bucketCols.size() != keyCols.size()) {
      return false;
    }
    // Check columns.
    for (int i = 0; i < sortCols.size(); i++) {
      ExprNodeDesc sortCol = rsOp.getColumnExprMap().get(sortCols.get(i));
      ExprNodeDesc bucketCol = rsOp.getColumnExprMap().get(bucketCols.get(i));
      if (!(sortCol.isSame(keyCols.get(i)) && bucketCol.isSame(keyCols.get(i)))) {
        return false;
      }
    }
    // check that the parent's traits are the same as the RS's
    OpTraits parentTraits = rsOp.getParentOperators().get(0).getOpTraits();
    if (null == parentTraits) {
      // programming error - shouldn't be null
      return false;
    }
    if (!checkColEquality(parentTraits.getSortCols(), rsOp.getOpTraits().getSortCols(),
        rsOp.getColumnExprMap(), false)) {
      LOG.info("We cannot convert to SMB because the sort column names do not match.");
      return false;
    }
    if (!checkColEquality(parentTraits.getBucketColNames(), rsOp.getOpTraits().getBucketColNames(),
        rsOp.getColumnExprMap(), true)) {
      LOG.info("We cannot convert to SMB because bucket column names do not match.");
      return false;
    }
  }
  if (numBuckets < 0) {
    numBuckets = bigTableRS.getConf().getNumReducers();
  }
  tezBucketJoinProcCtx.setNumBuckets(numBuckets);
  // Bucketing uses two different versions: version 1 for existing tables and
  // version 2 for new tables. All the inputs to the SMB join must be from the
  // same version. This only applies to tables read directly, not to intermediate
  // outputs of joins/group-bys.
  int bucketingVersion = -1;
  for (Operator<? extends OperatorDesc> parentOp : joinOp.getParentOperators()) {
    // Check if the parent comes from a table scan and, if so, which bucketing version it uses.
    assert parentOp.getParentOperators() != null && parentOp.getParentOperators().size() == 1;
    Operator<?> op = parentOp;
    while (op != null && !(op instanceof TableScanOperator
        || op instanceof ReduceSinkOperator || op instanceof CommonJoinOperator)) {
      // If op has parents, it is guaranteed to have exactly one.
      List<Operator<?>> parents = op.getParentOperators();
      Preconditions.checkState(parents.size() == 0 || parents.size() == 1);
      op = parents.size() == 1 ? parents.get(0) : null;
    }
    if (op instanceof TableScanOperator) {
      int localVersion = ((TableScanOperator) op).getConf().getTableMetadata().getBucketingVersion();
      if (bucketingVersion == -1) {
        bucketingVersion = localVersion;
      } else if (bucketingVersion != localVersion) {
        // versions don't match, return false.
        LOG.debug("SMB Join can't be performed due to bucketing version mismatch");
        return false;
      }
    }
  }
  LOG.info("We can convert the join to an SMB join.");
  return true;
}
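
The per-position key check in the middle of the method boils down to the comparison sketched below, with expressions represented as plain strings instead of ExprNodeDesc (a deliberate simplification): the RS key columns must match both the sort columns and the bucket columns one-to-one, otherwise SMB conversion is rejected.

import java.util.Arrays;
import java.util.List;
import java.util.Map;

public class SmbKeyCheckSketch {
  // Hypothetical: columns and expressions are plain strings here instead of ExprNodeDesc.
  static boolean keysMatchSortAndBucketCols(List<String> keyExprs, List<String> sortCols,
      List<String> bucketCols, Map<String, String> columnExprMap) {
    if (sortCols.size() != keyExprs.size() || bucketCols.size() != keyExprs.size()) {
      return false;                                   // arity must match exactly
    }
    for (int i = 0; i < sortCols.size(); i++) {
      String sortExpr = columnExprMap.get(sortCols.get(i));
      String bucketExpr = columnExprMap.get(bucketCols.get(i));
      // every key position must equal both the sort and the bucket expression
      if (!keyExprs.get(i).equals(sortExpr) || !keyExprs.get(i).equals(bucketExpr)) {
        return false;
      }
    }
    return true;
  }

  public static void main(String[] args) {
    Map<String, String> exprMap = Map.of("_col0", "key", "_col1", "value");
    List<String> keys = Arrays.asList("key");
    System.out.println(keysMatchSortAndBucketCols(keys,
        Arrays.asList("_col0"), Arrays.asList("_col0"), exprMap));   // true
    System.out.println(keysMatchSortAndBucketCols(keys,
        Arrays.asList("_col1"), Arrays.asList("_col0"), exprMap));   // false
  }
}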
use of org.apache.hadoop.hive.ql.exec.TableScanOperator in project hive by apache.
the class SparkSkewJoinProcFactory method splitTask.
/**
 * If the join is not in a leaf ReduceWork, the Spark task has to be split into two tasks.
 */
private static void splitTask(SparkTask currentTask, ReduceWork reduceWork,
    ParseContext parseContext) throws SemanticException {
  SparkWork currentWork = currentTask.getWork();
  Set<Operator<?>> reduceSinkSet = OperatorUtils.getOp(reduceWork, ReduceSinkOperator.class);
  if (currentWork.getChildren(reduceWork).size() == 1 && canSplit(currentWork)
      && reduceSinkSet.size() == 1) {
    ReduceSinkOperator reduceSink = (ReduceSinkOperator) reduceSinkSet.iterator().next();
    BaseWork childWork = currentWork.getChildren(reduceWork).get(0);
    SparkEdgeProperty originEdge = currentWork.getEdgeProperty(reduceWork, childWork);
    // disconnect the reduce work from its child. this should produce two isolated sub-graphs
    currentWork.disconnect(reduceWork, childWork);
    // move the works following the current reduce work into a new spark work
    SparkWork newWork = new SparkWork(parseContext.getConf().getVar(HiveConf.ConfVars.HIVEQUERYID));
    newWork.add(childWork);
    copyWorkGraph(currentWork, newWork, childWork);
    // remove them from the current spark work
    for (BaseWork baseWork : newWork.getAllWorkUnsorted()) {
      currentWork.remove(baseWork);
      currentWork.getCloneToWork().remove(baseWork);
    }
    // create a TS to read the intermediate data
    Context baseCtx = parseContext.getContext();
    Path taskTmpDir = baseCtx.getMRTmpPath();
    Operator<? extends OperatorDesc> rsParent = reduceSink.getParentOperators().get(0);
    TableDesc tableDesc = PlanUtils.getIntermediateFileTableDesc(
        PlanUtils.getFieldSchemasFromRowSchema(rsParent.getSchema(), "temporarycol"));
    // this will insert an FS and a TS between the RS and its parent
    TableScanOperator tableScanOp =
        GenMapRedUtils.createTemporaryFile(rsParent, reduceSink, taskTmpDir, tableDesc, parseContext);
    // create a new MapWork
    MapWork mapWork = PlanUtils.getMapRedWork().getMapWork();
    mapWork.setName("Map " + GenSparkUtils.getUtils().getNextSeqNumber());
    newWork.add(mapWork);
    newWork.connect(mapWork, childWork, originEdge);
    // set up the new map work
    String streamDesc = taskTmpDir.toUri().toString();
    if (GenMapRedUtils.needsTagging((ReduceWork) childWork)) {
      Operator<? extends OperatorDesc> childReducer = ((ReduceWork) childWork).getReducer();
      String id = null;
      if (childReducer instanceof JoinOperator) {
        if (parseContext.getJoinOps().contains(childReducer)) {
          id = ((JoinOperator) childReducer).getConf().getId();
        }
      } else if (childReducer instanceof MapJoinOperator) {
        if (parseContext.getMapJoinOps().contains(childReducer)) {
          id = ((MapJoinOperator) childReducer).getConf().getId();
        }
      } else if (childReducer instanceof SMBMapJoinOperator) {
        if (parseContext.getSmbMapJoinOps().contains(childReducer)) {
          id = ((SMBMapJoinOperator) childReducer).getConf().getId();
        }
      }
      if (id != null) {
        streamDesc = id + ":$INTNAME";
      } else {
        streamDesc = "$INTNAME";
      }
      String origStreamDesc = streamDesc;
      int pos = 0;
      while (mapWork.getAliasToWork().get(streamDesc) != null) {
        streamDesc = origStreamDesc.concat(String.valueOf(++pos));
      }
    }
    GenMapRedUtils.setTaskPlan(taskTmpDir, streamDesc, tableScanOp, mapWork, false, tableDesc);
    // insert the new task between the current task and its child
    @SuppressWarnings("unchecked")
    Task<?> newTask = TaskFactory.get(newWork);
    List<Task<?>> childTasks = currentTask.getChildTasks();
    // must have at most one child
    if (childTasks != null && childTasks.size() > 0) {
      Task<?> childTask = childTasks.get(0);
      currentTask.removeDependentTask(childTask);
      newTask.addDependentTask(childTask);
    }
    currentTask.addDependentTask(newTask);
    newTask.setFetchSource(currentTask.isFetchSource());
  }
}
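
The graph surgery can be pictured with a tiny hypothetical work graph (not SparkWork): disconnect the reduce work from its child, move everything reachable from the child into a new work, then bridge the two with a new map-side reader of the intermediate file. The WorkGraph type and the node names below are illustrative only.

import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Deque;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

// Hypothetical work graph used only to illustrate the split; not Hive's SparkWork.
public class SplitWorkSketch {
  static class WorkGraph {
    final Map<String, List<String>> edges = new HashMap<>();
    final Set<String> nodes = new HashSet<>();
    void add(String n) { nodes.add(n); edges.putIfAbsent(n, new ArrayList<>()); }
    void connect(String a, String b) { add(a); add(b); edges.get(a).add(b); }
    void disconnect(String a, String b) { edges.get(a).remove(b); }
  }

  // Move every node reachable from 'start' (inclusive) out of 'from' and into 'to'.
  static void moveSubGraph(WorkGraph from, WorkGraph to, String start) {
    Deque<String> stack = new ArrayDeque<>();
    stack.push(start);
    while (!stack.isEmpty()) {
      String n = stack.pop();
      if (!to.nodes.add(n)) {
        continue;                                      // already moved
      }
      to.edges.put(n, new ArrayList<>(from.edges.getOrDefault(n, List.of())));
      from.nodes.remove(n);
      from.edges.remove(n);
      for (String child : to.edges.get(n)) {
        stack.push(child);
      }
    }
  }

  public static void main(String[] args) {
    WorkGraph current = new WorkGraph();
    current.connect("Map 1", "Reducer 2");      // reducer holding the skewed join
    current.connect("Reducer 2", "Reducer 3");  // downstream work that must move out
    // 1. disconnect the reduce work from its child
    current.disconnect("Reducer 2", "Reducer 3");
    // 2. move the child's sub-graph into a new work
    WorkGraph newWork = new WorkGraph();
    moveSubGraph(current, newWork, "Reducer 3");
    // 3. bridge the graphs: a new map work reads the intermediate file and feeds the child
    newWork.connect("Map 4 (reads tmp dir)", "Reducer 3");
    System.out.println("current work: " + current.nodes);  // Map 1 and Reducer 2 remain
    System.out.println("new work: " + newWork.nodes);      // Reducer 3 plus the new map work
  }
}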