Use of org.apache.hadoop.hive.ql.parse.ParseContext in project hive by apache.
Class GenMRTableScan1, method process.
/**
 * Table scan encountered.
 *
 * @param nd
 *          the table scan operator encountered
 * @param opProcCtx
 *          context
 */
@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx opProcCtx, Object... nodeOutputs) throws SemanticException {
  TableScanOperator op = (TableScanOperator) nd;
  GenMRProcContext ctx = (GenMRProcContext) opProcCtx;
  ctx.reset();
  ParseContext parseCtx = ctx.getParseCtx();
  Table table = op.getConf().getTableMetadata();
  Class<? extends InputFormat> inputFormat = table.getInputFormatClass();
  Map<Operator<? extends OperatorDesc>, GenMapRedCtx> mapCurrCtx = ctx.getMapCurrCtx();
  // create a dummy MapReduce task
  MapredWork currWork = GenMapRedUtils.getMapRedWork(parseCtx);
  MapRedTask currTask = (MapRedTask) TaskFactory.get(currWork);
  ctx.setCurrTask(currTask);
  ctx.setCurrTopOp(op);
  for (String alias : parseCtx.getTopOps().keySet()) {
    Operator<? extends OperatorDesc> currOp = parseCtx.getTopOps().get(alias);
    if (currOp == op) {
      String currAliasId = alias;
      ctx.setCurrAliasId(currAliasId);
      mapCurrCtx.put(op, new GenMapRedCtx(currTask, currAliasId));
      if (parseCtx.getQueryProperties().isAnalyzeCommand()) {
        boolean noScan = parseCtx.getQueryProperties().isNoScanAnalyzeCommand();
        if (BasicStatsNoJobTask.canUseBasicStats(table, inputFormat)) {
          // For ORC and Parquet, the following statements are equivalent:
          // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS
          // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS noscan;
          // There will not be any MR or Tez job above this task
          StatsWork statWork = new StatsWork(table, parseCtx.getConf());
          statWork.setFooterScan();
          // If a partition is specified, get the pruned partition list
          Set<Partition> confirmedParts = GenMapRedUtils.getConfirmedPartitionsForScan(op);
          if (confirmedParts.size() > 0) {
            List<String> partCols = GenMapRedUtils.getPartitionColumns(op);
            PrunedPartitionList partList = new PrunedPartitionList(table, confirmedParts, partCols, false);
            statWork.addInputPartitions(partList.getPartitions());
          }
          Task<StatsWork> snjTask = TaskFactory.get(statWork);
          ctx.setCurrTask(snjTask);
          ctx.setCurrTopOp(null);
          ctx.getRootTasks().clear();
          ctx.getRootTasks().add(snjTask);
        } else {
          // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS;
          // The plan consists of a simple MapRedTask followed by a StatsTask.
          // The MR task is just a simple TableScanOperator
          BasicStatsWork statsWork = new BasicStatsWork(table.getTableSpec());
          statsWork.setIsExplicitAnalyze(true);
          statsWork.setNoScanAnalyzeCommand(noScan);
          StatsWork columnStatsWork = new StatsWork(table, statsWork, parseCtx.getConf());
          columnStatsWork.collectStatsFromAggregator(op.getConf());
          columnStatsWork.setSourceTask(currTask);
          Task<StatsWork> columnStatsTask = TaskFactory.get(columnStatsWork);
          currTask.addDependentTask(columnStatsTask);
          if (!ctx.getRootTasks().contains(currTask)) {
            ctx.getRootTasks().add(currTask);
          }
          // For a noscan ANALYZE, the plan consists of a StatsTask only.
          if (noScan) {
            columnStatsTask.setParentTasks(null);
            ctx.getRootTasks().remove(currTask);
            ctx.getRootTasks().add(columnStatsTask);
          }
          currWork.getMapWork().setGatheringStats(true);
          if (currWork.getReduceWork() != null) {
            currWork.getReduceWork().setGatheringStats(true);
          }
          // NOTE: here we should use the new partition predicate pushdown API to get the
          // pruned partition list and pass it to setTaskPlan as the last parameter
          Set<Partition> confirmedPartns = GenMapRedUtils.getConfirmedPartitionsForScan(op);
          if (confirmedPartns.size() > 0) {
            List<String> partCols = GenMapRedUtils.getPartitionColumns(op);
            PrunedPartitionList partList = new PrunedPartitionList(table, confirmedPartns, partCols, false);
            GenMapRedUtils.setTaskPlan(currAliasId, op, currTask, false, ctx, partList);
          } else {
            // non-partitioned table
            GenMapRedUtils.setTaskPlan(currAliasId, op, currTask, false, ctx);
          }
        }
      }
      return true;
    }
  }
  assert false;
  return null;
}
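GenMRTableScan1 is only one rule in the MapReduce task-tree generation walk. The sketch below shows how such a processor is typically registered and driven over the operator tree; it is a hedged sketch, not copied from Hive, and it assumes the older walker class names (Rule, NodeProcessor, Dispatcher, GraphWalker; newer Hive releases use SemanticRule, SemanticNodeProcessor, and so on) plus an in-scope GenMRProcContext named procCtx and a ParseContext named parseCtx.

// Hedged wiring sketch: register GenMRTableScan1 for TableScanOperator nodes
// and walk the operator DAG starting from the top operators of the query.
Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>();
opRules.put(new RuleRegExp("R1", TableScanOperator.getOperatorName() + "%"), new GenMRTableScan1());

// GenMROperator acts as the default processor when no rule matches a node.
Dispatcher disp = new DefaultRuleDispatcher(new GenMROperator(), opRules, procCtx);
GraphWalker walker = new GenMapRedWalker(disp);

ArrayList<Node> topNodes = new ArrayList<Node>();
topNodes.addAll(parseCtx.getTopOps().values());
walker.startWalking(topNodes, null);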
Use of org.apache.hadoop.hive.ql.parse.ParseContext in project hive by apache.
Class GenMRUnion1, method process.
/**
 * Union Operator encountered. Currently, the algorithm is pretty simple: if
 * all the sub-queries are map-only, don't do anything. Otherwise, insert a
 * FileSink on top of all the sub-queries.
 *
 * This can be optimized later on.
 *
 * @param nd
 *          the union operator encountered
 * @param opProcCtx
 *          context
 */
@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx opProcCtx, Object... nodeOutputs) throws SemanticException {
  UnionOperator union = (UnionOperator) nd;
  GenMRProcContext ctx = (GenMRProcContext) opProcCtx;
  ParseContext parseCtx = ctx.getParseCtx();
  UnionProcContext uCtx = parseCtx.getUCtx();
  // Map-only sub-queries can be optimized in the future to not write to a file
  Map<Operator<? extends OperatorDesc>, GenMapRedCtx> mapCurrCtx = ctx.getMapCurrCtx();
  if (union.getConf().isAllInputsInSameReducer()) {
    // All inputs of this UnionOperator are in the same Reducer.
    // We do not need to break the operator tree.
    mapCurrCtx.put((Operator<? extends OperatorDesc>) nd, new GenMapRedCtx(ctx.getCurrTask(), ctx.getCurrAliasId()));
    return null;
  }
  UnionParseContext uPrsCtx = uCtx.getUnionParseContext(union);
  ctx.setCurrUnionOp(union);
  // The plan needs to be broken up only if at least one sub-query involves a
  // map-reduce job
  if (uPrsCtx.allMapOnlySubQ()) {
    return processMapOnlyUnion(union, stack, ctx, uCtx);
  }
  assert uPrsCtx != null;
  Task<?> currTask = ctx.getCurrTask();
  int pos = UnionProcFactory.getPositionParent(union, stack);
  Task<?> uTask = null;
  MapredWork uPlan = null;
  // union is encountered for the first time
  GenMRUnionCtx uCtxTask = ctx.getUnionTask(union);
  if (uCtxTask == null) {
    uPlan = GenMapRedUtils.getMapRedWork(parseCtx);
    uTask = TaskFactory.get(uPlan);
    uCtxTask = new GenMRUnionCtx(uTask);
    ctx.setUnionTask(union, uCtxTask);
  } else {
    uTask = uCtxTask.getUTask();
  }
  // Copy into the current union task plan if the sub-query is map-only and its task is a root task
  if (uPrsCtx.getMapOnlySubq(pos) && uPrsCtx.getRootTask(pos)) {
    processSubQueryUnionMerge(ctx, uCtxTask, union, stack);
    if (ctx.getRootTasks().contains(currTask)) {
      ctx.getRootTasks().remove(currTask);
    }
  } else {
    // If it is a map-reduce job, create a temporary file
    // is the current task a root task?
    if (shouldBeRootTask(currTask) && !ctx.getRootTasks().contains(currTask) && (currTask.getParentTasks() == null || currTask.getParentTasks().isEmpty())) {
      ctx.getRootTasks().add(currTask);
    }
    processSubQueryUnionCreateIntermediate(union.getParentOperators().get(pos), union, uTask, ctx, uCtxTask);
    // the currAliasId and currTopOp are not valid any more
    ctx.setCurrAliasId(null);
    ctx.setCurrTopOp(null);
    ctx.getOpTaskMap().put(null, uTask);
  }
  ctx.setCurrTask(uTask);
  mapCurrCtx.put((Operator<? extends OperatorDesc>) nd, new GenMapRedCtx(ctx.getCurrTask(), null));
  return true;
}
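The branching above can be hard to follow in one pass. The helper below is purely illustrative (the enum and method names are invented, not Hive code), but it uses the same predicates GenMRUnion1 consults, in the same order:

// Invented summary helper; the predicates are the ones used in process() above.
enum UnionCase { SAME_REDUCER, ALL_MAP_ONLY, MERGE_MAP_ONLY_SUBQ, CREATE_INTERMEDIATE }

static UnionCase classify(UnionOperator union, UnionParseContext uPrsCtx, int pos) {
  if (union.getConf().isAllInputsInSameReducer()) {
    return UnionCase.SAME_REDUCER;          // keep the operator tree intact
  }
  if (uPrsCtx.allMapOnlySubQ()) {
    return UnionCase.ALL_MAP_ONLY;          // handled by processMapOnlyUnion
  }
  if (uPrsCtx.getMapOnlySubq(pos) && uPrsCtx.getRootTask(pos)) {
    return UnionCase.MERGE_MAP_ONLY_SUBQ;   // merge the sub-query plan into the union task
  }
  return UnionCase.CREATE_INTERMEDIATE;     // write a temporary file between the two tasks
}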
Use of org.apache.hadoop.hive.ql.parse.ParseContext in project hive by apache.
Class GenMapRedUtils, method createMoveTask.
/**
 * Create and add any dependent move tasks.
 *
 * @param currTask
 * @param chDir
 * @param fsOp
 * @param parseCtx
 * @param mvTasks
 * @param hconf
 * @param dependencyTask
 * @return the intermediate (merge input) directory of the file sink when chDir is true, otherwise null
 */
public static Path createMoveTask(Task<?> currTask, boolean chDir, FileSinkOperator fsOp, ParseContext parseCtx, List<Task<MoveWork>> mvTasks, HiveConf hconf, DependencyCollectionTask dependencyTask) {
  Path dest = null;
  FileSinkDesc fileSinkDesc = fsOp.getConf();
  boolean isMmTable = fileSinkDesc.isMmTable();
  boolean isDirectInsert = fileSinkDesc.isDirectInsert();
  if (chDir) {
    dest = fileSinkDesc.getMergeInputDirName();
    /**
     * Skip temporary file generation for:
     * 1. MM Tables
     * 2. INSERT operation on full ACID table
     */
    if (!isMmTable && !isDirectInsert) {
      // generate the temporary file
      // it must be on the same file system as the current destination
      Context baseCtx = parseCtx.getContext();
      // Create the required temporary file in the HDFS location if the destination
      // path of the FileSinkOperator table is a blobstore path.
      Path tmpDir = baseCtx.getTempDirForFinalJobPath(fileSinkDesc.getDestPath());
      // Change all the linked file sink descriptors
      if (fileSinkDesc.isLinkedFileSink()) {
        for (FileSinkDesc fsConf : fileSinkDesc.getLinkedFileSinkDesc()) {
          fsConf.setDirName(new Path(tmpDir, fsConf.getDirName().getName()));
          if (Utilities.FILE_OP_LOGGER.isTraceEnabled()) {
            Utilities.FILE_OP_LOGGER.trace("createMoveTask setting tmpDir for LinkedFileSink chDir " + fsConf.getDirName() + "; dest was " + fileSinkDesc.getDestPath());
          }
        }
      } else {
        fileSinkDesc.setDirName(tmpDir);
        if (Utilities.FILE_OP_LOGGER.isTraceEnabled()) {
          Utilities.FILE_OP_LOGGER.trace("createMoveTask setting tmpDir chDir " + tmpDir + "; dest was " + fileSinkDesc.getDestPath());
        }
      }
    }
  }
  Task<MoveWork> mvTask = null;
  if (!chDir) {
    mvTask = GenMapRedUtils.findMoveTaskForFsopOutput(mvTasks, fsOp.getConf().getFinalDirName(), isMmTable, isDirectInsert, fsOp.getConf().getMoveTaskId(), fsOp.getConf().getAcidOperation());
  }
  // Set the move task to be dependent on the current task
  if (mvTask != null) {
    GenMapRedUtils.addDependentMoveTasks(mvTask, hconf, currTask, dependencyTask);
  }
  return dest;
}
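For context, here is a hedged sketch of how a file-sink processor might call createMoveTask. The variable names (ctx, fsOp, hconf) and the GenMRProcContext accessors used here (getMvTask, getDependencyTaskForMultiInsert) are assumptions for illustration, not a copy of the actual Hive call site.

// Usage sketch under the assumptions named above. chDir == true asks the
// FileSinkOperator to write to a temporary directory that a later MoveTask
// (or merge job) relocates to the final location.
Path dir = GenMapRedUtils.createMoveTask(
    ctx.getCurrTask(),                      // task producing the files
    true,                                   // chDir: route output through a temp dir
    fsOp,                                   // the FileSinkOperator being processed
    ctx.getParseCtx(),
    ctx.getMvTask(),                        // MoveTasks created by the semantic analyzer
    hconf,
    ctx.getDependencyTaskForMultiInsert());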
Use of org.apache.hadoop.hive.ql.parse.ParseContext in project hive by apache.
Class SparkMapJoinOptimizer, method convertJoinBucketMapJoin.
private int convertJoinBucketMapJoin(JoinOperator joinOp, MapJoinOperator mapJoinOp, OptimizeSparkProcContext context, int bigTablePosition) throws SemanticException {
  ParseContext parseContext = context.getParseContext();
  List<String> joinAliases = new ArrayList<String>();
  String baseBigAlias = null;
  Map<Integer, Set<String>> posToAliasMap = joinOp.getPosToAliasMap();
  for (Map.Entry<Integer, Set<String>> entry : posToAliasMap.entrySet()) {
    if (entry.getKey().intValue() == bigTablePosition) {
      baseBigAlias = entry.getValue().iterator().next();
    }
    for (String alias : entry.getValue()) {
      if (!joinAliases.contains(alias)) {
        joinAliases.add(alias);
      }
    }
  }
  mapJoinOp.setPosToAliasMap(posToAliasMap);
  BucketMapjoinProc.checkAndConvertBucketMapJoin(parseContext, mapJoinOp, baseBigAlias, joinAliases);
  MapJoinDesc joinDesc = mapJoinOp.getConf();
  return joinDesc.isBucketMapJoin() ? joinDesc.getBigTableBucketNumMapping().size() : -1;
}
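The return value encodes whether the conversion succeeded. Below is a hedged caller-side sketch; the real caller lives elsewhere in SparkMapJoinOptimizer and may do more with the result.

// -1 means the bucket map join conversion did not apply; otherwise the value is
// (roughly) the number of big-table buckets recorded in the MapJoinDesc mapping.
int numBuckets = convertJoinBucketMapJoin(joinOp, mapJoinOp, context, bigTablePosition);
if (numBuckets < 0) {
  // fall back to the plain map join produced earlier in the optimizer
}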
Use of org.apache.hadoop.hive.ql.parse.ParseContext in project hive by apache.
Class TestNullScanTaskDispatcher, method setup.
@Before
public void setup() {
  hiveConf = new HiveConf();
  hiveConf.set("fs.mock.impl", MockFileSystem.class.getName());
  hiveConf.setBoolVar(HiveConf.ConfVars.HIVEMETADATAONLYQUERIES, true);
  sessionState = SessionState.start(hiveConf);
  parseContext = spy(new ParseContext());
  context = new Context(hiveConf);
  parseContext.setTopOps(aliasToWork);
  mapWork.setAliasToWork(aliasToWork);
  createReduceWork();
}
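The ParseContext is wrapped in a Mockito spy so that individual accessors can be overridden while the rest of the real object keeps working. A hedged one-line example follows; the stubbed value is hypothetical and not taken from the actual test.

// Stub a single getter on the spy; all other ParseContext methods still run the real code.
doReturn(new QueryProperties()).when(parseContext).getQueryProperties();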