Use of org.apache.hadoop.hive.ql.parse.ParseContext in project hive by apache.
Class SparkCompiler, method generateTaskTree.
/**
* TODO: need to turn on rules that are commented out and add more if necessary.
*/
@Override
protected void generateTaskTree(List<Task<?>> rootTasks, ParseContext pCtx, List<Task<MoveWork>> mvTask, Set<ReadEntity> inputs, Set<WriteEntity> outputs) throws SemanticException {
PERF_LOGGER.perfLogBegin(CLASS_NAME, PerfLogger.SPARK_GENERATE_TASK_TREE);
GenSparkUtils utils = GenSparkUtils.getUtils();
utils.resetSequenceNumber();
ParseContext tempParseContext = getParseContext(pCtx, rootTasks);
GenSparkProcContext procCtx = new GenSparkProcContext(conf, tempParseContext, mvTask, rootTasks, inputs, outputs, pCtx.getTopOps());
// -------------------------------- First Pass ---------------------------------- //
// Identify SparkPartitionPruningSinkOperators, and break OP tree if necessary
Map<SemanticRule, SemanticNodeProcessor> opRules = new LinkedHashMap<SemanticRule, SemanticNodeProcessor>();
opRules.put(new RuleRegExp("Clone OP tree for PartitionPruningSink", SparkPartitionPruningSinkOperator.getOperatorName() + "%"), new SplitOpTreeForDPP());
SemanticDispatcher disp = new DefaultRuleDispatcher(null, opRules, procCtx);
SemanticGraphWalker ogw = new GenSparkWorkWalker(disp, procCtx);
List<Node> topNodes = new ArrayList<Node>();
topNodes.addAll(pCtx.getTopOps().values());
ogw.startWalking(topNodes, null);
// -------------------------------- Second Pass ---------------------------------- //
// Process operator tree in two steps: first we process the extra op trees generated
// in the first pass. Then we process the main op tree, and the result task will depend
// on the task generated in the first pass.
topNodes.clear();
topNodes.addAll(procCtx.topOps.values());
generateTaskTreeHelper(procCtx, topNodes);
// If this set is not empty, it means we need to generate a separate task for collecting
// the partitions used.
if (!procCtx.clonedPruningTableScanSet.isEmpty()) {
SparkTask pruningTask = SparkUtilities.createSparkTask(conf);
SparkTask mainTask = procCtx.currentTask;
pruningTask.addDependentTask(procCtx.currentTask);
procCtx.rootTasks.remove(procCtx.currentTask);
procCtx.rootTasks.add(pruningTask);
procCtx.currentTask = pruningTask;
topNodes.clear();
topNodes.addAll(procCtx.clonedPruningTableScanSet);
generateTaskTreeHelper(procCtx, topNodes);
procCtx.currentTask = mainTask;
}
// we still need to clone some operator plans and remove union operators
for (BaseWork w : procCtx.workWithUnionOperators) {
GenSparkUtils.getUtils().removeUnionOperators(procCtx, w);
}
// we need to fill MapWork with 'local' work and bucket information for SMB Join.
GenSparkUtils.getUtils().annotateMapWork(procCtx);
// finally make sure the file sink operators are set up right
for (FileSinkOperator fileSink : procCtx.fileSinkSet) {
GenSparkUtils.getUtils().processFileSink(procCtx, fileSink);
}
// Process partition pruning sinks
for (Operator<?> prunerSink : procCtx.pruningSinkSet) {
utils.processPartitionPruningSink(procCtx, (SparkPartitionPruningSinkOperator) prunerSink);
}
PERF_LOGGER.perfLogEnd(CLASS_NAME, PerfLogger.SPARK_GENERATE_TASK_TREE);
}
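The first pass above is an instance of Hive's generic rule/dispatcher/walker pattern: a RuleRegExp keyed on an operator name selects a SemanticNodeProcessor, a DefaultRuleDispatcher fires it, and a graph walker drives the traversal from the ParseContext's top operators. Below is a minimal sketch of that pattern, not taken from the Hive codebase: the class and method names are invented, and it assumes a recent Hive where these interfaces live in org.apache.hadoop.hive.ql.lib and DefaultGraphWalker implements SemanticGraphWalker.

import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Stack;

import org.apache.hadoop.hive.ql.exec.TableScanOperator;
import org.apache.hadoop.hive.ql.lib.DefaultGraphWalker;
import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher;
import org.apache.hadoop.hive.ql.lib.Node;
import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
import org.apache.hadoop.hive.ql.lib.RuleRegExp;
import org.apache.hadoop.hive.ql.lib.SemanticDispatcher;
import org.apache.hadoop.hive.ql.lib.SemanticGraphWalker;
import org.apache.hadoop.hive.ql.lib.SemanticNodeProcessor;
import org.apache.hadoop.hive.ql.lib.SemanticRule;
import org.apache.hadoop.hive.ql.parse.ParseContext;
import org.apache.hadoop.hive.ql.parse.SemanticException;

public class TableScanCounter {
  // Walks the operator DAG rooted at the ParseContext's top operators and counts
  // every TableScanOperator the dispatcher routes to our rule.
  public static int countTableScans(ParseContext pCtx) throws SemanticException {
    final int[] count = { 0 };
    Map<SemanticRule, SemanticNodeProcessor> rules = new LinkedHashMap<>();
    // Fire the processor whenever the walker reaches a TableScanOperator ("TS%").
    rules.put(new RuleRegExp("Count table scans", TableScanOperator.getOperatorName() + "%"),
        new SemanticNodeProcessor() {
          @Override
          public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx ctx,
              Object... nodeOutputs) throws SemanticException {
            count[0]++;
            return null;
          }
        });
    SemanticDispatcher disp = new DefaultRuleDispatcher(null, rules, null);
    SemanticGraphWalker walker = new DefaultGraphWalker(disp);
    List<Node> topNodes = new ArrayList<>(pCtx.getTopOps().values());
    walker.startWalking(topNodes, null);
    return count[0];
  }
}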
Use of org.apache.hadoop.hive.ql.parse.ParseContext in project hive by apache.
Class SparkProcessAnalyzeTable, method process.
@SuppressWarnings("unchecked")
@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procContext, Object... nodeOutputs) throws SemanticException {
GenSparkProcContext context = (GenSparkProcContext) procContext;
TableScanOperator tableScan = (TableScanOperator) nd;
ParseContext parseContext = context.parseContext;
Table table = tableScan.getConf().getTableMetadata();
@SuppressWarnings("rawtypes") Class<? extends InputFormat> inputFormat = table.getInputFormatClass();
if (parseContext.getQueryProperties().isAnalyzeCommand()) {
Preconditions.checkArgument(tableScan.getChildOperators() == null || tableScan.getChildOperators().size() == 0, "AssertionError: expected tableScan.getChildOperators() to be null, " + "or tableScan.getChildOperators().size() to be 0");
String alias = null;
for (String a : parseContext.getTopOps().keySet()) {
if (tableScan == parseContext.getTopOps().get(a)) {
alias = a;
}
}
Preconditions.checkArgument(alias != null, "AssertionError: expected alias to be not null");
SparkWork sparkWork = context.currentTask.getWork();
if (BasicStatsNoJobTask.canUseBasicStats(table, inputFormat)) {
// For ORC, Parquet and Iceberg tables, all the following statements are the same
// ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS
// ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS noscan;
// There will not be any Spark job above this task
StatsWork statWork = new StatsWork(table, parseContext.getConf());
statWork.setFooterScan();
// If partition is specified, get pruned partition list
Set<Partition> confirmedParts = GenMapRedUtils.getConfirmedPartitionsForScan(tableScan);
if (confirmedParts.size() > 0) {
List<String> partCols = GenMapRedUtils.getPartitionColumns(tableScan);
PrunedPartitionList partList = new PrunedPartitionList(table, confirmedParts, partCols, false);
statWork.addInputPartitions(partList.getPartitions());
}
Task<StatsWork> snjTask = TaskFactory.get(statWork);
snjTask.setParentTasks(null);
context.rootTasks.remove(context.currentTask);
context.rootTasks.add(snjTask);
return true;
} else {
// ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS;
// The plan consists of a simple SparkTask followed by a StatsTask.
// The Spark task is just a simple TableScanOperator
BasicStatsWork basicStatsWork = new BasicStatsWork(table.getTableSpec());
basicStatsWork.setIsExplicitAnalyze(true);
basicStatsWork.setNoScanAnalyzeCommand(parseContext.getQueryProperties().isNoScanAnalyzeCommand());
StatsWork columnStatsWork = new StatsWork(table, basicStatsWork, parseContext.getConf());
columnStatsWork.collectStatsFromAggregator(tableScan.getConf());
columnStatsWork.setSourceTask(context.currentTask);
Task<StatsWork> statsTask = TaskFactory.get(columnStatsWork);
context.currentTask.addDependentTask(statsTask);
// The plan consists of a StatsTask only.
if (parseContext.getQueryProperties().isNoScanAnalyzeCommand()) {
statsTask.setParentTasks(null);
context.rootTasks.remove(context.currentTask);
context.rootTasks.add(statsTask);
}
// NOTE: here we should use the new partition predicate pushdown API to get the list of pruned partitions,
// and pass it to setTaskPlan as the last parameter
Set<Partition> confirmedPartns = GenMapRedUtils.getConfirmedPartitionsForScan(tableScan);
PrunedPartitionList partitions = null;
if (confirmedPartns.size() > 0) {
List<String> partCols = GenMapRedUtils.getPartitionColumns(tableScan);
partitions = new PrunedPartitionList(table, confirmedPartns, partCols, false);
}
MapWork w = utils.createMapWork(context, tableScan, sparkWork, partitions);
w.setGatheringStats(true);
return true;
}
}
return null;
}
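The branch structure above reduces to a single decision: when BasicStatsNoJobTask.canUseBasicStats(...) reports that statistics can be read from file footers, a metadata-only StatsTask replaces the Spark job entirely; otherwise a TableScan-backed Spark task gathers basic stats and a dependent StatsTask aggregates them. The following hypothetical private helper, which could sit in this same class (so it reuses the imports already present there), makes the split explicit; the helper's name and parameter list are illustrative, while the individual calls are the ones used in process() above.

// Hypothetical helper, reusing only classes and calls that appear in process() above.
// The caller still decides whether the returned task becomes a root task (footer-scan case)
// or a dependent of the Spark scan task (full-scan case), exactly as in the code above.
private Task<StatsWork> buildStatsTask(Table table,
    @SuppressWarnings("rawtypes") Class<? extends InputFormat> inputFormat,
    TableScanOperator tableScan, GenSparkProcContext context, ParseContext parseContext) {
  if (BasicStatsNoJobTask.canUseBasicStats(table, inputFormat)) {
    // Stats come from file footers; no Spark job is needed at all.
    StatsWork statWork = new StatsWork(table, parseContext.getConf());
    statWork.setFooterScan();
    return TaskFactory.get(statWork);
  }
  // Otherwise the TableScan-backed Spark task gathers basic stats and a StatsTask
  // aggregates them afterwards.
  BasicStatsWork basicStatsWork = new BasicStatsWork(table.getTableSpec());
  basicStatsWork.setNoScanAnalyzeCommand(parseContext.getQueryProperties().isNoScanAnalyzeCommand());
  StatsWork columnStatsWork = new StatsWork(table, basicStatsWork, parseContext.getConf());
  columnStatsWork.collectStatsFromAggregator(tableScan.getConf());
  columnStatsWork.setSourceTask(context.currentTask);
  return TaskFactory.get(columnStatsWork);
}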
Use of org.apache.hadoop.hive.ql.parse.ParseContext in project hive by apache.
Class TestSparkUtilities, method testCreateMoveTaskDoesntCreateCascadeTempDirs.
@Test
public void testCreateMoveTaskDoesntCreateCascadeTempDirs() throws Exception {
FileSinkOperator fsOp = mock(FileSinkOperator.class);
ParseContext pctx = mock(ParseContext.class);
Configuration conf = new Configuration();
conf.set("_hive.hdfs.session.path", "hdfs:/dummypath");
conf.set("_hive.local.session.path", "hdfs:/dummypath");
Context ctx = new Context(conf);
String executionId = ctx.getExecutionId();
Context ctxSpy = spy(ctx);
FileSinkDesc fileSinkDesc = mock(FileSinkDesc.class);
Path mrPath = new Path("hdfs:/tmp/.staging/" + executionId + "/-mr-10001");
Path mrPath2 = new Path("hdfs:/tmp/.staging/" + executionId + "/-mr-10002");
Path extPath = new Path("hdfs:/tmp/.staging/" + executionId + "/-ext-10001");
Path extPath2 = new Path("hdfs:/tmp/.staging/" + executionId + "/-ext-10002");
final Ref<Path> expectedPathRef = new Ref<>(mrPath);
final Ref<Path> testPathRef = new Ref<>(extPath);
doAnswer(invocationOnMock -> {
return ctxSpy;
}).when(pctx).getContext();
doAnswer(invocationOnMock -> {
return mrPath2;
}).when(ctxSpy).getMRTmpPath();
doAnswer(invocationOnMock -> {
return extPath2;
}).when(ctxSpy).getExternalTmpPath(any(Path.class));
doAnswer(invocationOnMock -> {
return testPathRef.value;
}).when(fileSinkDesc).getFinalDirName();
doAnswer(invocationOnMock -> {
return null;
}).when(fileSinkDesc).getLinkedFileSinkDesc();
doAnswer(invocationOnMock -> {
return fileSinkDesc;
}).when(fsOp).getConf();
doAnswer(invocationOnMock -> {
assertEquals(expectedPathRef.value, invocationOnMock.getArgument(0, Path.class));
return null;
}).when(fileSinkDesc).setDirName(any(Path.class));
testPathRef.value = mrPath;
expectedPathRef.value = mrPath2;
GenSparkUtils.createMoveTask(null, true, fsOp, pctx, null, null, null);
testPathRef.value = extPath;
expectedPathRef.value = extPath2;
GenSparkUtils.createMoveTask(null, true, fsOp, pctx, null, null, null);
}
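The test relies on a single doAnswer stub whose assertion reads a mutable Ref<Path> holder, so the same stub can check a different expected directory on each call to createMoveTask. Below is a stripped-down, generic version of that pattern, assuming JUnit 4 and Mockito 2+; the Sink interface and all names here are invented for illustration.

import static org.junit.Assert.assertEquals;
import static org.mockito.ArgumentMatchers.anyString;
import static org.mockito.Mockito.doAnswer;
import static org.mockito.Mockito.mock;

import java.util.concurrent.atomic.AtomicReference;

public class MutableExpectationSketch {
  // Invented stand-in for the collaborator under test.
  interface Sink {
    void write(String dir);
  }

  public static void main(String[] args) {
    Sink sink = mock(Sink.class);
    AtomicReference<String> expected = new AtomicReference<>("dir-one");
    // One stub, many expectations: the assertion reads whatever the holder contains
    // at the moment the stubbed method is invoked.
    doAnswer(invocation -> {
      String actual = invocation.getArgument(0);
      assertEquals(expected.get(), actual);
      return null;
    }).when(sink).write(anyString());

    sink.write("dir-one");      // matches the first expectation
    expected.set("dir-two");    // flip the expectation between calls
    sink.write("dir-two");      // the same stub verifies the new value
  }
}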
Use of org.apache.hadoop.hive.ql.parse.ParseContext in project hive by apache.
Class IndexWhereProcessor, method process.
@Override
/**
* Process a node of the operator tree. This matches on the rule in IndexWhereTaskDispatcher.
*/
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx, Object... nodeOutputs) throws SemanticException {
TableScanOperator operator = (TableScanOperator) nd;
List<Node> opChildren = operator.getChildren();
TableScanDesc operatorDesc = operator.getConf();
if (operatorDesc == null || !tsToIndices.containsKey(operator)) {
return null;
}
List<Index> indexes = tsToIndices.get(operator);
ExprNodeDesc predicate = operatorDesc.getFilterExpr();
IndexWhereProcCtx context = (IndexWhereProcCtx) procCtx;
ParseContext pctx = context.getParseContext();
LOG.info("Processing predicate for index optimization");
if (predicate == null) {
LOG.info("null predicate pushed down");
return null;
}
LOG.info(predicate.getExprString());
// check if we have tsToIndices on all partitions in this table scan
Set<Partition> queryPartitions;
try {
queryPartitions = IndexUtils.checkPartitionsCoveredByIndex(operator, pctx, indexes);
if (queryPartitions == null) {
// partitions not covered
return null;
}
} catch (HiveException e) {
LOG.error("Fatal Error: problem accessing metastore", e);
throw new SemanticException(e);
}
// we can only process MapReduce tasks to check input size
if (!context.getCurrentTask().isMapRedTask()) {
return null;
}
MapRedTask currentTask = (MapRedTask) context.getCurrentTask();
// get potential reentrant index queries from each index
Map<Index, HiveIndexQueryContext> queryContexts = new HashMap<Index, HiveIndexQueryContext>();
// make sure we have an index on the table being scanned
TableDesc tblDesc = operator.getTableDesc();
Map<String, List<Index>> indexesByType = new HashMap<String, List<Index>>();
for (Index indexOnTable : indexes) {
if (indexesByType.get(indexOnTable.getIndexHandlerClass()) == null) {
List<Index> newType = new ArrayList<Index>();
newType.add(indexOnTable);
indexesByType.put(indexOnTable.getIndexHandlerClass(), newType);
} else {
indexesByType.get(indexOnTable.getIndexHandlerClass()).add(indexOnTable);
}
}
// choose index type with most tsToIndices of the same type on the table
// TODO HIVE-2130 This would be a good place for some sort of cost based choice?
List<Index> bestIndexes = indexesByType.values().iterator().next();
for (List<Index> indexTypes : indexesByType.values()) {
if (bestIndexes.size() < indexTypes.size()) {
bestIndexes = indexTypes;
}
}
// rewrite index queries for the chosen index type
HiveIndexQueryContext tmpQueryContext = new HiveIndexQueryContext();
tmpQueryContext.setQueryPartitions(queryPartitions);
rewriteForIndexes(predicate, bestIndexes, pctx, currentTask, tmpQueryContext);
List<Task<?>> indexTasks = tmpQueryContext.getQueryTasks();
if (indexTasks != null && indexTasks.size() > 0) {
queryContexts.put(bestIndexes.get(0), tmpQueryContext);
}
// choose an index rewrite to use
if (queryContexts.size() > 0) {
// TODO HIVE-2130 This would be a good place for some sort of cost based choice?
Index chosenIndex = queryContexts.keySet().iterator().next();
// modify the parse context to use indexing
// we need to delay this until we choose one index so that we don't attempt to modify pctx multiple times
HiveIndexQueryContext queryContext = queryContexts.get(chosenIndex);
// prepare the map reduce job to use indexing
MapWork work = currentTask.getWork().getMapWork();
work.setInputformat(queryContext.getIndexInputFormat());
work.addIndexIntermediateFile(queryContext.getIndexIntermediateFile());
// modify inputs based on index query
Set<ReadEntity> inputs = pctx.getSemanticInputs();
inputs.addAll(queryContext.getAdditionalSemanticInputs());
List<Task<?>> chosenRewrite = queryContext.getQueryTasks();
// add dependencies so index query runs first
insertIndexQuery(pctx, context, chosenRewrite);
}
return null;
}
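The HashMap loop above groups the table's indexes by handler class and then keeps the largest group as bestIndexes. The same selection can be written more compactly with streams; this sketch uses an invented stand-in for the Hive metastore Index class so it stays self-contained.

import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

public class BestIndexGroupSketch {
  // Stand-in for the Hive metastore Index class; only the one method we need.
  interface Index {
    String getIndexHandlerClass();
  }

  // Group indexes by handler class and return the largest group, mirroring the
  // hand-rolled HashMap loop above (ties are still resolved arbitrarily).
  static List<Index> bestIndexes(List<Index> indexes) {
    Map<String, List<Index>> byType = indexes.stream()
        .collect(Collectors.groupingBy(Index::getIndexHandlerClass));
    return byType.values().stream()
        .max(Comparator.comparingInt((List<Index> group) -> group.size()))
        .orElse(Collections.emptyList());
  }
}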
Use of org.apache.hadoop.hive.ql.parse.ParseContext in project hive by apache.
Class IndexWhereTaskDispatcher, method dispatch.
@Override
public Object dispatch(Node nd, Stack<Node> stack, Object... nodeOutputs) throws SemanticException {
Task<? extends Serializable> task = (Task<? extends Serializable>) nd;
ParseContext pctx = physicalContext.getParseContext();
// create the regex's so the walker can recognize our WHERE queries
Map<Rule, NodeProcessor> operatorRules = createOperatorRules(pctx);
// check for no indexes on any table
if (operatorRules == null) {
return null;
}
// create context so the walker can carry the current task with it.
IndexWhereProcCtx indexWhereOptimizeCtx = new IndexWhereProcCtx(task, pctx);
// create the dispatcher, which fires the processor according to the rule that
// best matches
Dispatcher dispatcher = new DefaultRuleDispatcher(getDefaultProcessor(), operatorRules, indexWhereOptimizeCtx);
// walk the mapper operator(not task) tree for each specific task
GraphWalker ogw = new DefaultGraphWalker(dispatcher);
ArrayList<Node> topNodes = new ArrayList<Node>();
if (task.getWork() instanceof MapredWork) {
topNodes.addAll(((MapredWork) task.getWork()).getMapWork().getAliasToWork().values());
} else {
return null;
}
ogw.startWalking(topNodes, null);
return null;
}
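dispatch() passes getDefaultProcessor() to DefaultRuleDispatcher so that operators not matched by any index rule are still visited but left unchanged. Below is a plausible shape for such a default processor, written against the older NodeProcessor interface this example uses; it is an assumption about the shape, not the actual Hive implementation.

import java.util.Stack;

import org.apache.hadoop.hive.ql.lib.Node;
import org.apache.hadoop.hive.ql.lib.NodeProcessor;
import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
import org.apache.hadoop.hive.ql.parse.SemanticException;

// Assumed shape of a default processor: visit the node, change nothing, return no output.
public class NoOpDefaultProcessor implements NodeProcessor {
  @Override
  public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
      Object... nodeOutputs) throws SemanticException {
    // Operators that no index rule matched are left untouched.
    return null;
  }
}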