use of org.apache.hadoop.hive.ql.exec.TableScanOperator in project hive by apache.
the class TezCompiler method connect.
private void connect(Operator<?> o, AtomicInteger index, Stack<Operator<?>> nodes, Map<Operator<?>, Integer> indexes, Map<Operator<?>, Integer> lowLinks, Set<Set<Operator<?>>> components, ParseContext parseContext) {
indexes.put(o, index.get());
lowLinks.put(o, index.get());
index.incrementAndGet();
nodes.push(o);
List<Operator<?>> children;
if (o instanceof AppMasterEventOperator) {
children = new ArrayList<Operator<?>>();
children.addAll(o.getChildOperators());
TableScanOperator ts = ((DynamicPruningEventDesc) o.getConf()).getTableScan();
LOG.debug("Adding special edge: " + o.getName() + " --> " + ts.toString());
children.add(ts);
} else if (o instanceof ReduceSinkOperator) {
// semijoin case
children = new ArrayList<Operator<?>>();
children.addAll(o.getChildOperators());
SemiJoinBranchInfo sjInfo = parseContext.getRsToSemiJoinBranchInfo().get(o);
if (sjInfo != null) {
TableScanOperator ts = sjInfo.getTsOp();
LOG.debug("Adding special edge: " + o.getName() + " --> " + ts.toString());
children.add(ts);
}
} else {
children = o.getChildOperators();
}
for (Operator<?> child : children) {
if (!indexes.containsKey(child)) {
connect(child, index, nodes, indexes, lowLinks, components, parseContext);
lowLinks.put(o, Math.min(lowLinks.get(o), lowLinks.get(child)));
} else if (nodes.contains(child)) {
lowLinks.put(o, Math.min(lowLinks.get(o), indexes.get(child)));
}
}
if (lowLinks.get(o).equals(indexes.get(o))) {
Set<Operator<?>> component = new LinkedHashSet<Operator<?>>();
components.add(component);
Operator<?> current;
do {
current = nodes.pop();
component.add(current);
} while (current != o);
}
}
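The connect method above is essentially Tarjan's strongly connected components algorithm run over the operator graph, with extra edges added from AppMasterEventOperators and semijoin ReduceSinks to their target TableScanOperators so that cycles introduced by those optimizations are detected as well. Below is a minimal sketch of the same index/low-link bookkeeping over a plain adjacency map; the TarjanSketch class and its method names are hypothetical and illustrate the algorithm only, not Hive's API. It assumes every node of the graph appears as a key in the adjacency map.

// Minimal sketch of Tarjan's SCC bookkeeping used by connect(), written
// against a plain adjacency map instead of Hive operators (hypothetical type).
import java.util.*;
import java.util.concurrent.atomic.AtomicInteger;

class TarjanSketch<T> {
  private final Map<T, List<T>> graph;                 // node -> children
  private final Map<T, Integer> indexes = new HashMap<>();
  private final Map<T, Integer> lowLinks = new HashMap<>();
  private final Deque<T> stack = new ArrayDeque<>();
  private final AtomicInteger index = new AtomicInteger(0);
  private final Set<Set<T>> components = new LinkedHashSet<>();

  TarjanSketch(Map<T, List<T>> graph) { this.graph = graph; }

  Set<Set<T>> findComponents() {
    for (T node : graph.keySet()) {
      if (!indexes.containsKey(node)) {
        visit(node);
      }
    }
    return components;
  }

  private void visit(T node) {
    // assign a visit index and an initial low-link, then push onto the stack
    indexes.put(node, index.get());
    lowLinks.put(node, index.get());
    index.incrementAndGet();
    stack.push(node);
    for (T child : graph.getOrDefault(node, Collections.emptyList())) {
      if (!indexes.containsKey(child)) {
        visit(child);
        lowLinks.put(node, Math.min(lowLinks.get(node), lowLinks.get(child)));
      } else if (stack.contains(child)) {
        // back edge to a node still on the stack
        lowLinks.put(node, Math.min(lowLinks.get(node), indexes.get(child)));
      }
    }
    if (lowLinks.get(node).equals(indexes.get(node))) {
      // this node is the root of a component: pop the stack down to it
      Set<T> component = new LinkedHashSet<>();
      components.add(component);
      T current;
      do {
        current = stack.pop();
        component.add(current);
      } while (current != node);
    }
  }
}

In the Hive version, the "special edges" added for AppMasterEventOperator and semijoin ReduceSinks are what let the compiler spot components that contain both a pruning branch and the TableScan it would prune, i.e. cycles that must be broken.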
use of org.apache.hadoop.hive.ql.exec.TableScanOperator in project hive by apache.
the class GenTezUtils method processDynamicSemiJoinPushDownOperator.
public static void processDynamicSemiJoinPushDownOperator(GenTezProcContext procCtx, RuntimeValuesInfo runtimeValuesInfo, ReduceSinkOperator rs) throws SemanticException {
SemiJoinBranchInfo sjInfo = procCtx.parseContext.getRsToSemiJoinBranchInfo().get(rs);
List<BaseWork> rsWorkList = procCtx.childToWorkMap.get(rs);
if (sjInfo == null || rsWorkList == null) {
// The semijoin branch info or the work for this ReduceSink has already been removed (e.g. by cycle detection logic). Nothing to do here.
return;
}
if (rsWorkList.size() != 1) {
StringBuilder sb = new StringBuilder();
for (BaseWork curWork : rsWorkList) {
if (sb.length() > 0) {
sb.append(", ");
}
sb.append(curWork.getName());
}
throw new SemanticException(rs + " belongs to multiple BaseWorks: " + sb.toString());
}
TableScanOperator ts = sjInfo.getTsOp();
LOG.debug("ResduceSink " + rs + " to TableScan " + ts);
BaseWork parentWork = rsWorkList.get(0);
BaseWork childWork = procCtx.rootToWorkMap.get(ts);
// Connect parent/child work with a broadcast edge.
LOG.debug("Connecting BaseWork - " + parentWork.getName() + " to " + childWork.getName());
TezEdgeProperty edgeProperty = new TezEdgeProperty(EdgeType.BROADCAST_EDGE);
TezWork tezWork = procCtx.currentTask.getWork();
tezWork.connect(parentWork, childWork, edgeProperty);
// Set output names in ReduceSink
rs.getConf().setOutputName(childWork.getName());
// Set up the dynamic values in the childWork.
RuntimeValuesInfo childRuntimeValuesInfo = new RuntimeValuesInfo();
childRuntimeValuesInfo.setTableDesc(runtimeValuesInfo.getTableDesc());
childRuntimeValuesInfo.setDynamicValueIDs(runtimeValuesInfo.getDynamicValueIDs());
childRuntimeValuesInfo.setColExprs(runtimeValuesInfo.getColExprs());
childWork.setInputSourceToRuntimeValuesInfo(parentWork.getName(), childRuntimeValuesInfo);
}
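A hypothetical driver loop showing how the method above could be invoked for every semijoin ReduceSink recorded during compilation. The getRsToRuntimeValuesInfoMap accessor and the surrounding SemiJoinPushDownDriver class are assumptions made for this illustration, not the actual Hive call site.

// Hypothetical driver: run the pushdown wiring for each semijoin ReduceSink.
import java.util.Map;
import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator;
import org.apache.hadoop.hive.ql.parse.GenTezProcContext;
import org.apache.hadoop.hive.ql.parse.GenTezUtils;
import org.apache.hadoop.hive.ql.parse.RuntimeValuesInfo;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.ql.parse.SemiJoinBranchInfo;

final class SemiJoinPushDownDriver {
  private SemiJoinPushDownDriver() {}

  static void processAll(GenTezProcContext procCtx) throws SemanticException {
    Map<ReduceSinkOperator, SemiJoinBranchInfo> branches =
        procCtx.parseContext.getRsToSemiJoinBranchInfo();
    for (ReduceSinkOperator rs : branches.keySet()) {
      // runtime values (the semijoin reduction payload) produced for this branch;
      // the accessor name is an assumption for this sketch
      RuntimeValuesInfo info = procCtx.parseContext.getRsToRuntimeValuesInfoMap().get(rs);
      if (info != null) {
        GenTezUtils.processDynamicSemiJoinPushDownOperator(procCtx, info, rs);
      }
    }
  }
}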
use of org.apache.hadoop.hive.ql.exec.TableScanOperator in project hive by apache.
the class GenTezUtils method removeUnionOperators.
// removes any union operator and clones the plan
public static void removeUnionOperators(GenTezProcContext context, BaseWork work, int indexForTezUnion) throws SemanticException {
List<Operator<?>> roots = new ArrayList<Operator<?>>();
roots.addAll(work.getAllRootOperators());
if (work.getDummyOps() != null) {
roots.addAll(work.getDummyOps());
}
roots.addAll(context.eventOperatorSet);
// need to clone the plan.
List<Operator<?>> newRoots = SerializationUtilities.cloneOperatorTree(roots, indexForTezUnion);
// we're cloning the operator plan but we're retaining the original work. That means
// that root operators have to be replaced with the cloned ops. The replacement map
// tells you what that mapping is.
BiMap<Operator<?>, Operator<?>> replacementMap = HashBiMap.create();
// there's some special handling for dummyOps required. Mapjoins won't be properly
// initialized if their dummy parents aren't initialized. Since we cloned the plan
// we need to replace the dummy operators in the work with the cloned ones.
List<HashTableDummyOperator> dummyOps = new LinkedList<HashTableDummyOperator>();
Iterator<Operator<?>> it = newRoots.iterator();
for (Operator<?> orig : roots) {
Set<FileSinkOperator> fsOpSet = OperatorUtils.findOperators(orig, FileSinkOperator.class);
for (FileSinkOperator fsOp : fsOpSet) {
context.fileSinkSet.remove(fsOp);
}
Operator<?> newRoot = it.next();
replacementMap.put(orig, newRoot);
if (newRoot instanceof HashTableDummyOperator) {
// dummy ops need to be updated to the cloned ones.
dummyOps.add((HashTableDummyOperator) newRoot);
it.remove();
} else if (newRoot instanceof AppMasterEventOperator) {
// need to restore the original scan.
if (newRoot.getConf() instanceof DynamicPruningEventDesc) {
TableScanOperator ts = ((DynamicPruningEventDesc) orig.getConf()).getTableScan();
if (ts == null) {
throw new AssertionError("No table scan associated with dynamic event pruning. " + orig);
}
((DynamicPruningEventDesc) newRoot.getConf()).setTableScan(ts);
}
it.remove();
} else {
if (newRoot instanceof TableScanOperator) {
if (context.tsToEventMap.containsKey(orig)) {
// we need to update event operators with the cloned table scan
for (AppMasterEventOperator event : context.tsToEventMap.get(orig)) {
((DynamicPruningEventDesc) event.getConf()).setTableScan((TableScanOperator) newRoot);
}
}
// This TableScanOperator could be part of semijoin optimization.
Map<ReduceSinkOperator, SemiJoinBranchInfo> rsToSemiJoinBranchInfo = context.parseContext.getRsToSemiJoinBranchInfo();
for (ReduceSinkOperator rs : rsToSemiJoinBranchInfo.keySet()) {
SemiJoinBranchInfo sjInfo = rsToSemiJoinBranchInfo.get(rs);
if (sjInfo.getTsOp() == orig) {
SemiJoinBranchInfo newSJInfo = new SemiJoinBranchInfo((TableScanOperator) newRoot, sjInfo.getIsHint());
rsToSemiJoinBranchInfo.put(rs, newSJInfo);
}
}
}
context.rootToWorkMap.remove(orig);
context.rootToWorkMap.put(newRoot, work);
}
}
// now we remove all the unions. we throw away any branch that's not reachable from
// the current set of roots. The reason is that those branches will be handled in
// different tasks.
Deque<Operator<?>> operators = new LinkedList<Operator<?>>();
operators.addAll(newRoots);
Set<Operator<?>> seen = new HashSet<Operator<?>>();
while (!operators.isEmpty()) {
Operator<?> current = operators.pop();
seen.add(current);
if (current instanceof FileSinkOperator) {
FileSinkOperator fileSink = (FileSinkOperator) current;
// remember it for additional processing later
context.fileSinkSet.add(fileSink);
FileSinkDesc desc = fileSink.getConf();
Path path = desc.getDirName();
List<FileSinkDesc> linked;
if (!context.linkedFileSinks.containsKey(path)) {
linked = new ArrayList<FileSinkDesc>();
context.linkedFileSinks.put(path, linked);
}
linked = context.linkedFileSinks.get(path);
linked.add(desc);
desc.setDirName(new Path(path, AbstractFileMergeOperator.UNION_SUDBIR_PREFIX + linked.size()));
Utilities.FILE_OP_LOGGER.debug("removing union - new desc with " + desc.getDirName() + "; parent " + path);
desc.setLinkedFileSink(true);
desc.setLinkedFileSinkDesc(linked);
}
if (current instanceof AppMasterEventOperator) {
// remember for additional processing later
context.eventOperatorSet.add((AppMasterEventOperator) current);
// mark the original as abandoned. Don't need it anymore.
context.abandonedEventOperatorSet.add((AppMasterEventOperator) replacementMap.inverse().get(current));
}
if (current instanceof UnionOperator) {
Operator<?> parent = null;
int count = 0;
for (Operator<?> op : current.getParentOperators()) {
if (seen.contains(op)) {
++count;
parent = op;
}
}
// we should have been able to reach the union from only one side.
assert count <= 1;
if (parent == null) {
// root operator is union (can happen in reducers)
replacementMap.put(current, current.getChildOperators().get(0));
} else {
parent.removeChildAndAdoptItsChildren(current);
}
}
if (current instanceof FileSinkOperator || current instanceof ReduceSinkOperator) {
current.setChildOperators(null);
} else {
operators.addAll(current.getChildOperators());
}
}
LOG.debug("Setting dummy ops for work " + work.getName() + ": " + dummyOps);
work.setDummyOps(dummyOps);
work.replaceRoots(replacementMap);
}
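The union removal above leans on Operator.removeChildAndAdoptItsChildren to splice the UnionOperator out of the branch that was reached first. A rough sketch of what that splice amounts to, written against a hypothetical Node type rather than Hive's Operator:

// Sketch of the splice performed by removeChildAndAdoptItsChildren on a
// hypothetical Node type: the union node is removed and its children are
// re-parented onto the surviving parent.
import java.util.ArrayList;
import java.util.List;

class Node {
  final List<Node> parents = new ArrayList<>();
  final List<Node> children = new ArrayList<>();

  void removeChildAndAdoptItsChildren(Node child) {
    children.remove(child);
    for (Node grandChild : child.children) {
      // the grandchild now reports this node as its parent instead of the union
      grandChild.parents.remove(child);
      grandChild.parents.add(this);
      children.add(grandChild);
    }
    // detach the removed node completely
    child.parents.clear();
    child.children.clear();
  }
}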
use of org.apache.hadoop.hive.ql.exec.TableScanOperator in project hive by apache.
the class ProcessAnalyzeTable method process.
@SuppressWarnings("unchecked")
@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procContext, Object... nodeOutputs) throws SemanticException {
GenTezProcContext context = (GenTezProcContext) procContext;
TableScanOperator tableScan = (TableScanOperator) nd;
ParseContext parseContext = context.parseContext;
Table table = tableScan.getConf().getTableMetadata();
Class<? extends InputFormat> inputFormat = table.getInputFormatClass();
if (parseContext.getQueryProperties().isAnalyzeCommand()) {
assert tableScan.getChildOperators() == null || tableScan.getChildOperators().size() == 0;
String alias = null;
for (String a : parseContext.getTopOps().keySet()) {
if (tableScan == parseContext.getTopOps().get(a)) {
alias = a;
}
}
assert alias != null;
TezWork tezWork = context.currentTask.getWork();
if (OrcInputFormat.class.isAssignableFrom(inputFormat) || MapredParquetInputFormat.class.isAssignableFrom(inputFormat)) {
// For ORC & Parquet, all the following statements are the same
// ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS
// ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS noscan;
// There will not be any Tez job above this task
StatsWork statWork = new StatsWork(table, parseContext.getConf());
statWork.setFooterScan();
// If partition is specified, get pruned partition list
Set<Partition> confirmedParts = GenMapRedUtils.getConfirmedPartitionsForScan(tableScan);
if (confirmedParts.size() > 0) {
List<String> partCols = GenMapRedUtils.getPartitionColumns(tableScan);
PrunedPartitionList partList = new PrunedPartitionList(table, confirmedParts, partCols, false);
statWork.addInputPartitions(partList.getPartitions());
}
Task<StatsWork> snjTask = TaskFactory.get(statWork);
snjTask.setParentTasks(null);
context.rootTasks.remove(context.currentTask);
context.rootTasks.add(snjTask);
return true;
} else {
// ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS;
// The plan consists of a simple TezTask followed by a StatsTask.
// The Tez task is just a simple TableScanOperator
BasicStatsWork basicStatsWork = new BasicStatsWork(table.getTableSpec());
basicStatsWork.setNoScanAnalyzeCommand(parseContext.getQueryProperties().isNoScanAnalyzeCommand());
StatsWork columnStatsWork = new StatsWork(table, basicStatsWork, parseContext.getConf());
columnStatsWork.collectStatsFromAggregator(tableScan.getConf());
columnStatsWork.setSourceTask(context.currentTask);
Task<StatsWork> statsTask = TaskFactory.get(columnStatsWork);
context.currentTask.addDependentTask(statsTask);
// The plan consists of a StatsTask only.
if (parseContext.getQueryProperties().isNoScanAnalyzeCommand()) {
statsTask.setParentTasks(null);
context.rootTasks.remove(context.currentTask);
context.rootTasks.add(statsTask);
}
// NOTE: here we should use the new partition predicate pushdown API to get
// the pruned partition list and pass it to setTaskPlan as the last parameter
Set<Partition> confirmedPartns = GenMapRedUtils.getConfirmedPartitionsForScan(tableScan);
PrunedPartitionList partitions = null;
if (confirmedPartns.size() > 0) {
List<String> partCols = GenMapRedUtils.getPartitionColumns(tableScan);
partitions = new PrunedPartitionList(table, confirmedPartns, partCols, false);
}
MapWork w = utils.createMapWork(context, tableScan, tezWork, partitions);
w.setGatheringStats(true);
return true;
}
}
return null;
}
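The branch above keys off the table's input format: for formats that keep statistics in file footers, the ANALYZE command can be answered by a StatsTask alone, with no Tez job above it. A small sketch of that check pulled into a helper; the StatsPlanHelper class and canUseFooterScan method are hypothetical names used only for illustration.

// Hedged sketch of the input-format test used above to decide whether stats
// can be collected from file footers alone (helper name is hypothetical).
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.hive.ql.io.orc.OrcInputFormat;
import org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat;

final class StatsPlanHelper {
  private StatsPlanHelper() {}

  static boolean canUseFooterScan(Class<? extends InputFormat> inputFormat) {
    // ORC and Parquet keep row counts and raw sizes in file footers, so
    // ANALYZE ... COMPUTE STATISTICS [noscan] needs no scan job for them.
    return OrcInputFormat.class.isAssignableFrom(inputFormat)
        || MapredParquetInputFormat.class.isAssignableFrom(inputFormat);
  }
}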
use of org.apache.hadoop.hive.ql.exec.TableScanOperator in project hive by apache.
the class GenSparkUtils method processPartitionPruningSink.
/**
* Populate partition pruning information from the pruning sink operator to the
* target MapWork (the MapWork for the big table side). The information includes the source table
* name, column name, and partition key expression. It also sets up the temporary path used to
* communicate between the target MapWork and source BaseWork.
*
* Here "source" refers to the small table side, while "target" refers to the big
* table side.
*
* @param context the spark context.
* @param pruningSink the pruner sink operator being processed.
*/
public void processPartitionPruningSink(GenSparkProcContext context, SparkPartitionPruningSinkOperator pruningSink) {
SparkPartitionPruningSinkDesc desc = pruningSink.getConf();
final Path outputBase = getDPPOutputPath(context.parseContext.getContext());
final String sourceId = pruningSink.getUniqueId();
desc.setPath(new Path(outputBase, sourceId));
for (SparkPartitionPruningSinkDesc.DPPTargetInfo targetInfo : desc.getTargetInfos()) {
TableScanOperator ts = targetInfo.tableScan;
MapWork targetWork = (MapWork) context.rootToWorkMap.get(ts);
Preconditions.checkNotNull(targetWork, "No targetWork found for tablescan " + ts);
// set up temporary path to communicate between the small/big table
if (targetWork.getTmpPathForPartitionPruning() == null) {
targetWork.setTmpPathForPartitionPruning(outputBase);
LOG.info("Setting tmp path between source work and target work:\n" + outputBase);
}
targetInfo.work = targetWork;
targetInfo.columnName = SparkUtilities.getWorkId(targetWork) + ":" + targetInfo.columnName;
pruningSink.addAsSourceEvent(targetWork, targetInfo.partKey, targetInfo.columnName, targetInfo.columnType);
}
}
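A hypothetical illustration of the naming scheme established above: each pruning sink writes under <DPP output base>/<unique sink id>, and each target partition column is prefixed with the id of the MapWork it prunes so the consumer can distinguish columns aimed at different target works. All concrete values in the sketch are made up.

// Illustration of the DPP path and column naming scheme (values are made up).
import org.apache.hadoop.fs.Path;

public class DppNamingExample {
  public static void main(String[] args) {
    Path outputBase = new Path("/tmp/hive/_dpp_output"); // assumed DPP scratch dir
    String sourceId = "sink_1";                          // pruningSink.getUniqueId()
    Path sinkPath = new Path(outputBase, sourceId);      // path the sink writes to

    String workId = "Map 1";                             // SparkUtilities.getWorkId(targetWork)
    String column = workId + ":" + "ds";                 // "<work id>:<partition column>"

    System.out.println(sinkPath + " -> " + column);
  }
}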