use of org.apache.hadoop.hive.ql.parse.ParseContext in project hive by apache.
the class GenMapRedUtils method splitPlan.
/**
 * Handles a reduce sink (cRS) met in the shape pOP(parentTask with RS)-cRS-cOP(noTask).
 * A new child task is created for cRS-cOP and the two tasks are linked through a
 * temporary file, giving: pOP-FS / TS-cRS-cOP.
 *
 * @param cRS
 *          the reduce sink operator encountered
 * @param opProcCtx
 *          processing context
 * @throws SemanticException
 *           propagated from {@code splitTasks}
 */
static void splitPlan(ReduceSinkOperator cRS, GenMRProcContext opProcCtx) throws SemanticException {
  // The task that is current on entry becomes the parent of the task created below.
  ParseContext pctx = opProcCtx.getParseCtx();
  Task<?> upstreamTask = opProcCtx.getCurrTask();

  // Obtain a plan from getMapRedWork and wrap it in a new task.
  MapredWork newPlan = getMapRedWork(pctx);
  Task<?> downstreamTask = TaskFactory.get(newPlan);

  // The first child of the reduce sink is installed as the reducer of the new plan.
  Operator<? extends OperatorDesc> reduceRoot = cRS.getChildOperators().get(0);
  ReduceWork reduceWork = new ReduceWork();
  newPlan.setReduceWork(reduceWork);
  reduceWork.setReducer(reduceRoot);

  // Carry over the reducer count stored in the reduce sink descriptor.
  ReduceSinkDesc sinkDesc = cRS.getConf();
  newPlan.getReduceWork().setNumReduceTasks(Integer.valueOf(sinkDesc.getNumReducers()));

  // Remember which task owns the reducer, then cut the plan between the two tasks.
  opProcCtx.getOpTaskMap().put(reduceRoot, downstreamTask);
  splitTasks(cRS, upstreamTask, downstreamTask, opProcCtx);
}
use of org.apache.hadoop.hive.ql.parse.ParseContext in project hive by apache.
the class GenMRFileSink1 method process.
/**
 * File Sink Operator encountered.
 *
 * <p>Makes the task that owns the sink's first parent the current task, marks its work as the
 * final map-reduce work, optionally creates a merge job for the written files, records the
 * child task for linked file sinks, and flags the task as a fetch source when the fetch task
 * reads from this sink.
 *
 * @param nd
 *          the file sink operator encountered
 * @param stack
 *          the node stack, handed unchanged to {@code processFS}
 * @param opProcCtx
 *          context; cast to {@code GenMRProcContext}
 * @param nodeOutputs
 *          not read by this method
 * @return always {@code true}
 */
@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx opProcCtx, Object... nodeOutputs) throws SemanticException {
  GenMRProcContext genCtx = (GenMRProcContext) opProcCtx;
  ParseContext pctx = genCtx.getParseCtx();
  FileSinkOperator sink = (FileSinkOperator) nd;

  // The task recorded for the sink's first parent is taken as the current task.
  Task<?> owner = genCtx.getMapCurrCtx().get(sink.getParentOperators().get(0)).getCurrTask();
  genCtx.setCurrTask(owner);
  genCtx.addRootIfPossible(owner);

  // Result of GenMapRedUtils.isInsertInto for this sink; consumed by the merge decision below.
  boolean insertIntoTable = GenMapRedUtils.isInsertInto(pctx, sink);
  HiveConf conf = pctx.getConf();

  // Mark this task as a final map reduce task (ignoring the optional merge task).
  ((MapredWork) owner.getWork()).setFinalMapRed(true);

  // If linked file sink tasks have already been recorded, reuse the task stored for this
  // sink's descriptor and stop here.
  Map<FileSinkDesc, Task<?>> alreadyLinked = genCtx.getLinkedFileDescTasks();
  if (alreadyLinked != null) {
    processLinkedFileDesc(genCtx, alreadyLinked.get(sink.getConf()));
    return true;
  }

  // A sink that is already in the seen set is not considered for merging again.
  boolean mergeNeeded = false;
  if (!(genCtx.getSeenFileSinkOps() != null && genCtx.getSeenFileSinkOps().contains(nd))) {
    mergeNeeded = GenMapRedUtils.isMergeRequired(genCtx.getMvTask(), conf, sink, owner, insertIntoTable);
  }

  Path finalName = processFS(sink, stack, opProcCtx, mergeNeeded);
  if (mergeNeeded) {
    // Merge the files in the destination table/partitions by creating Map-only merge job
    // If underlying data is RCFile or OrcFile, RCFileBlockMerge task or
    // OrcFileStripeMerge task would be created.
    LOG.info("using CombineHiveInputformat for the merge job");
    GenMapRedUtils.createMRWorkForMergingFiles(sink, finalName, genCtx.getDependencyTaskForMultiInsert(), genCtx.getMvTask(), conf, owner, pctx.getQueryState().getLineageState());
  }

  // Linked file sink with exactly one child task: register that child for every linked
  // descriptor so later sinks take the early-return path above.
  FileSinkDesc sinkDesc = sink.getConf();
  if (sinkDesc.isLinkedFileSink() && owner.getChildTasks() != null && owner.getChildTasks().size() == 1) {
    Map<FileSinkDesc, Task<?>> registry = genCtx.getLinkedFileDescTasks();
    if (registry == null) {
      registry = new HashMap<>();
      genCtx.setLinkedFileDescTasks(registry);
    }
    for (FileSinkDesc linked : sinkDesc.getLinkedFileSinkDesc()) {
      registry.put(linked, owner.getChildTasks().get(0));
    }
  }

  // A childless task whose sink feeds the fetch task is flagged as the fetch source.
  FetchTask fetch = pctx.getFetchTask();
  if (fetch != null && owner.getNumChild() == 0 && fetch.isFetchFrom(sinkDesc)) {
    owner.setFetchSource(true);
  }
  return true;
}
use of org.apache.hadoop.hive.ql.parse.ParseContext in project hive by apache.
the class GenMapRedUtils method splitTasks.
/**
 * Split two tasks by creating a temporary file between them.
 *
 * <p>A temporary directory is obtained from the query context, a temporary file with its
 * file sink and table scan operators is created between the reduce sink's parent and the
 * reduce sink, and the table scan is registered in the child task's map work. On return the
 * context's current task is {@code childTask} and its current top operator and alias id are
 * cleared.
 *
 * @param op reduce sink operator being processed
 * @param parentTask the parent task
 * @param childTask the child task
 * @param opProcCtx context
 * @throws SemanticException declared by the signature; presumably raised by the plan helpers
 * @throws IllegalStateException if {@code op} does not have exactly one parent
 */
@SuppressWarnings("nls")
private static void splitTasks(ReduceSinkOperator op, Task<?> parentTask, Task<?> childTask, GenMRProcContext opProcCtx) throws SemanticException {
  if (op.getNumParent() != 1) {
    throw new IllegalStateException("Expecting operator " + op + " to have one parent. " + "But found multiple parents : " + op.getParentOperators());
  }
  ParseContext parseCtx = opProcCtx.getParseCtx();
  parentTask.addDependentTask(childTask);

  // Root Task cannot depend on any other task, therefore childTask cannot be a root Task.
  // List.remove(Object) leaves the list untouched when the element is absent, so no
  // contains() pre-check is needed.
  opProcCtx.getRootTasks().remove(childTask);

  // Generate the temporary file name
  Context baseCtx = parseCtx.getContext();
  Path taskTmpDir = baseCtx.getMRTmpPath();
  Operator<? extends OperatorDesc> parent = op.getParentOperators().get(0);
  TableDesc tt_desc = PlanUtils.getIntermediateFileTableDesc(PlanUtils.getFieldSchemasFromRowSchema(parent.getSchema(), "temporarycol"));

  // Create the temporary file, its corresponding FileSinkOperator, and
  // its corresponding TableScanOperator.
  TableScanOperator tableScanOp = createTemporaryFile(parent, op, taskTmpDir, tt_desc, parseCtx);

  // The new table scan belongs to the child task.
  Map<Operator<? extends OperatorDesc>, GenMapRedCtx> mapCurrCtx = opProcCtx.getMapCurrCtx();
  mapCurrCtx.put(tableScanOp, new GenMapRedCtx(childTask, null));

  // Default alias is the temporary directory URI; replaced below when tagging is needed.
  String streamDesc = taskTmpDir.toUri().toString();
  MapredWork cplan = (MapredWork) childTask.getWork();
  if (needsTagging(cplan.getReduceWork())) {
    Operator<? extends OperatorDesc> reducerOp = cplan.getReduceWork().getReducer();

    // Use the join id as alias prefix only when the reducer is a join operator that the
    // parse context knows about.
    String id = null;
    if (reducerOp instanceof JoinOperator) {
      if (parseCtx.getJoinOps().contains(reducerOp)) {
        id = ((JoinOperator) reducerOp).getConf().getId();
      }
    } else if (reducerOp instanceof MapJoinOperator) {
      if (parseCtx.getMapJoinOps().contains(reducerOp)) {
        id = ((MapJoinOperator) reducerOp).getConf().getId();
      }
    } else if (reducerOp instanceof SMBMapJoinOperator) {
      if (parseCtx.getSmbMapJoinOps().contains(reducerOp)) {
        id = ((SMBMapJoinOperator) reducerOp).getConf().getId();
      }
    }
    streamDesc = (id != null) ? id + ":$INTNAME" : "$INTNAME";

    // Append an increasing number until the alias is not yet used in the child's map work.
    String origStreamDesc = streamDesc;
    int pos = 0;
    while (cplan.getMapWork().getAliasToWork().get(streamDesc) != null) {
      streamDesc = origStreamDesc.concat(String.valueOf(++pos));
    }
    // TODO: Allocate work to remove the temporary files and make that
    // dependent on the redTask
    cplan.getReduceWork().setNeedsTagging(true);
  }

  // Add the path to alias mapping
  setTaskPlan(taskTmpDir, streamDesc, tableScanOp, cplan.getMapWork(), false, tt_desc);

  // Subsequent processing continues in the child task with no current top operator/alias.
  opProcCtx.setCurrTopOp(null);
  opProcCtx.setCurrAliasId(null);
  opProcCtx.setCurrTask(childTask);
  opProcCtx.addRootIfPossible(parentTask);
}
use of org.apache.hadoop.hive.ql.parse.ParseContext in project hive by apache.
the class GenMRUnion1 method processSubQueryUnionCreateIntermediate.
/**
 * Process the union when the parent is a map-reduce job. Create a temporary
 * output, and let the union task read from the temporary output.
 *
 * The files created for all the inputs are collected in the union context and later
 * used to initialize the union plan.
 *
 * @param parent
 *          operator whose row schema defines the intermediate table; first operator passed
 *          to {@code createTemporaryFile}
 * @param child
 *          second operator passed to {@code createTemporaryFile}
 * @param uTask
 *          the union task; made dependent on the context's current task
 * @param ctx
 *          processing context supplying the parse context, current task and root tasks
 * @param uCtxTask
 *          union context that receives the temp dir, table descriptor and table scan operator
 */
private void processSubQueryUnionCreateIntermediate(Operator<? extends OperatorDesc> parent, Operator<? extends OperatorDesc> child, Task<?> uTask, GenMRProcContext ctx, GenMRUnionCtx uCtxTask) {
  ParseContext pctx = ctx.getParseCtx();
  TableDesc intermediateDesc = PlanUtils.getIntermediateFileTableDesc(
      PlanUtils.getFieldSchemasFromRowSchema(parent.getSchema(), "temporarycol"));

  // Generate the temporary directory, then create the temporary file together with its
  // FileSinkOperator and TableScanOperator.
  Path tmpDir = pctx.getContext().getMRTmpPath();
  TableScanOperator tmpScan = GenMapRedUtils.createTemporaryFile(parent, child, tmpDir, intermediateDesc, pctx);

  // Record this input in the union context; the union plan is initialized from it later.
  uCtxTask.addTaskTmpDir(tmpDir.toUri().toString());
  uCtxTask.addTTDesc(intermediateDesc);
  uCtxTask.addListTopOperators(tmpScan);

  // The union task itself is empty here; it only becomes a dependent of the current task.
  Task<?> producer = ctx.getCurrTask();
  producer.addDependentTask(uTask);

  // Nothing more to do unless the union task is currently listed as a root.
  if (!ctx.getRootTasks().contains(uTask)) {
    return;
  }
  ctx.getRootTasks().remove(uTask);
  if (!ctx.getRootTasks().contains(producer) && shouldBeRootTask(producer)) {
    ctx.getRootTasks().add(producer);
  }
}
use of org.apache.hadoop.hive.ql.parse.ParseContext in project hive by apache.
the class GlobalLimitOptimizer method transform.
/**
 * Decides whether the global limit optimization can be enabled for the query and, if so,
 * enables it on the parse context's {@code GlobalLimitCtx} with the query's limit and offset
 * (offset defaults to 0 when the limit descriptor has none).
 *
 * @param pctx the parse context of the query being optimized
 * @return the same {@code pctx} instance that was passed in
 * @throws SemanticException declared by the signature; presumably raised by partition pruning
 */
@Override
public ParseContext transform(ParseContext pctx) throws SemanticException {
  Map<String, TableScanOperator> topOps = pctx.getTopOps();
  GlobalLimitCtx globalLimitCtx = pctx.getGlobalLimitCtx();
  Map<String, SplitSample> nameToSplitSample = pctx.getNameToSplitSample();

  // The query only qualifies when there is exactly one top operator, no transform or UDTF,
  // and no split sampling is used.
  if (topOps.size() != 1 || globalLimitCtx.ifHasTransformOrUDTF() || !nameToSplitSample.isEmpty()) {
    return pctx;
  }

  // Here we recursively check:
  // 1. whether there are exact one LIMIT in the query
  // 2. whether there is no aggregation, group-by, distinct, sort by,
  // distributed by, or table sampling in any of the sub-query.
  // The query only qualifies if both conditions are satisfied.
  //
  // Example qualified queries:
  // CREATE TABLE ... AS SELECT col1, col2 FROM tbl LIMIT ..
  // INSERT OVERWRITE TABLE ... SELECT col1, hash(col2), split(col1)
  // FROM ... LIMIT...
  // SELECT * FROM (SELECT col1 AS col2 FROM (SELECT * FROM ...) t1 LIMIT ...) t2;
  //
  // Take alias and table scan from the same (single) map entry so they always match.
  Map.Entry<String, TableScanOperator> onlyTop = topOps.entrySet().iterator().next();
  TableScanOperator ts = onlyTop.getValue();
  Table tab = ts.getConf().getTableMetadata();
  if (tab.isNonNative()) {
    LOG.info("Not enabling limit optimization on non native table: " + tab.getTableName());
    return pctx;
  }

  // InputFormat.getSplits wont be called if no input path & TS Vertex will have 0 task parallelism
  // NOTE(review): the optimization is only attempted for tables without a storage handler.
  if (tab.getStorageHandler() != null) {
    return pctx;
  }

  // A null result means the query does not qualify for the optimization.
  LimitOperator tempGlobalLimit = checkQbpForGlobalLimit(ts);
  if (tempGlobalLimit == null) {
    return pctx;
  }

  LimitDesc tempGlobalLimitDesc = tempGlobalLimit.getConf();
  Set<FilterOperator> filterOps = OperatorUtils.findOperators(ts, FilterOperator.class);
  boolean qualifies;
  if (!tab.isPartitioned()) {
    // Unpartitioned table: any filter disqualifies the query.
    qualifies = filterOps.isEmpty();
  } else {
    // Partitioned table: the filters may only reference partition columns, and pruning
    // must not leave any partition of unknown status.
    qualifies = onlyContainsPartnCols(tab, filterOps)
        && !pctx.getPrunedPartitions(onlyTop.getKey(), ts).hasUnknownPartitions();
  }
  if (qualifies) {
    Integer tempOffset = tempGlobalLimitDesc.getOffset();
    globalLimitCtx.enableOpt(tempGlobalLimitDesc.getLimit(), (tempOffset == null) ? 0 : tempOffset);
  }

  if (globalLimitCtx.isEnable()) {
    LOG.info("Qualify the optimize that reduces input size for 'offset' for offset " + globalLimitCtx.getGlobalOffset());
    LOG.info("Qualify the optimize that reduces input size for 'limit' for limit " + globalLimitCtx.getGlobalLimit());
  }
  return pctx;
}
Aggregations