use of org.apache.hadoop.hive.ql.exec.TableScanOperator in project hive by apache.
the class SparkUtilities method removeNestedDPP.
* For DPP sinks w/ common join, we'll split the tree and what's above the branching
* operator is computed multiple times. Therefore it may not be good for performance to support
* nested DPP sinks, i.e. one DPP sink depends on other DPP sinks.
* The following is an example:
*             TS          TS
*             |           |
*            ...         FIL
*            |           |  \
*            RS         RS  SEL
*              \        /    |
*     TS          JOIN      GBY
*     |         /     \      |
*    RS        RS    SEL   DPP2
*     \       /       |
*       JOIN         GBY
*                     |
*                    DPP1
* where DPP1 depends on DPP2.
* To avoid such case, we'll visit all the branching operators. If a branching operator has any
* further away DPP branches w/ common join in its sub-tree, such branches will be removed.
* In the above example, the branch of DPP1 will be removed.
// Looks for DPP sinks (SparkPartitionPruningSinkOperator) that sit below a branching operator
// on a branch that is not itself a direct DPP branch, i.e. "nested" DPP sinks.
//
// NOTE(review): this excerpt has lost its closing braces and stops inside the innermost `if`,
// so the statement that actually removes a nested branch is not visible here. Confirm the
// remaining details against the full method.
//
// @param procContext optimizer context; its parse context supplies the top-level table scans
public static void removeNestedDPP(OptimizeSparkProcContext procContext) {
Set<SparkPartitionPruningSinkOperator> allDPPs = new HashSet<>();
// The same `seen` set is handed to every collectOp call in this method.
Set<Operator<?>> seen = new HashSet<>();
// collect all DPP sinks
// Each top-level table scan is used as a starting point for the collection.
for (TableScanOperator root : procContext.getParseContext().getTopOps().values()) {
SparkUtilities.collectOp(root, SparkPartitionPruningSinkOperator.class, allDPPs, seen);
// collect all branching operators
Set<Operator<?>> branchingOps = new HashSet<>();
// NOTE(review): the loop body that derives a branching operator from each `dpp` and adds it
// to branchingOps is not visible in this excerpt.
for (SparkPartitionPruningSinkOperator dpp : allDPPs) {
// remember the branching ops we have visited
Set<Operator<?>> visited = new HashSet<>();
for (Operator<?> branchingOp : branchingOps) {
if (!visited.contains(branchingOp)) {
// DPP sinks found below the non-direct branches of this branching operator.
Set<SparkPartitionPruningSinkOperator> nestedDPPs = new HashSet<>();
for (Operator<?> branch : branchingOp.getChildOperators()) {
// Branches accepted by isDirectDPPBranch are skipped; only the others are searched.
if (!isDirectDPPBranch(branch)) {
SparkUtilities.collectOp(branch, SparkPartitionPruningSinkOperator.class, nestedDPPs, seen);
for (SparkPartitionPruningSinkOperator nestedDPP : nestedDPPs) {
// if a DPP is with MJ, the tree won't be split and so we don't have to remove it
if (!nestedDPP.isWithMapjoin()) {
use of org.apache.hadoop.hive.ql.exec.TableScanOperator in project hive by apache.
the class Driver method recordValidWriteIds.
// Write the current set of valid write ids for the operated acid tables into the conf file so
// that it can be read by the input format.
// Computes the valid write-id list for the plan's transactional tables and stores its string
// form in `conf` under ValidTxnWriteIdList.VALID_TABLES_WRITEIDS_KEY.
//
// NOTE(review): closing braces (and possibly some statements) are missing from this excerpt.
//
// @param txnMgr transaction manager used to look up the valid write ids and the current txn id
// @throws LockException declared by the signature; the visible code does not throw it directly
// @throws IllegalStateException if the valid-txn string is missing from conf, or if a
//         transactional fetch source has no entry in the computed write-id list
private void recordValidWriteIds(HiveTxnManager txnMgr) throws LockException {
// The valid-txn snapshot must already have been written to conf before this method is called.
String txnString = conf.get(ValidTxnList.VALID_TXNS_KEY);
if ((txnString == null) || (txnString.isEmpty())) {
throw new IllegalStateException("calling recordValidWritsIdss() without initializing ValidTxnList " + JavaUtils.txnIdToString(txnMgr.getCurrentTxnId()));
ValidTxnWriteIdList txnWriteIds = txnMgr.getValidWriteIds(getTransactionalTableList(plan), txnString);
String writeIdStr = txnWriteIds.toString();
conf.set(ValidTxnWriteIdList.VALID_TABLES_WRITEIDS_KEY, writeIdStr);
// Extra handling when the plan carries a fetch task (see the explanation below).
if (plan.getFetchTask() != null) {
* This is needed for {@link HiveConf.ConfVars.HIVEFETCHTASKCONVERSION} optimization which
* initializes JobConf in FetchOperator before recordValidTxns() but this has to be done
* after locks are acquired to avoid race conditions in ACID.
* This case is supported only for single source query.
Operator<?> source = plan.getFetchTask().getWork().getSource();
// Only a table-scan source is examined; any other operator type is ignored here.
if (source instanceof TableScanOperator) {
TableScanOperator tsOp = (TableScanOperator) source;
String fullTableName = AcidUtils.getFullTableName(tsOp.getConf().getDatabaseName(), tsOp.getConf().getTableName());
ValidWriteIdList writeIdList = txnWriteIds.getTableValidWriteIdList(fullTableName);
// A transactional table without a write-id list is treated as an inconsistent state.
if (tsOp.getConf().isTranscationalTable() && (writeIdList == null)) {
throw new IllegalStateException("ACID table: " + fullTableName + " is missing from the ValidWriteIdList config: " + writeIdStr);
// NOTE(review): the body of this `if` is not visible in this excerpt; presumably it hands
// writeIdList to the fetch task — confirm.
if (writeIdList != null) {
LOG.debug("Encoding valid txn write ids info " + writeIdStr + " txnid:" + txnMgr.getCurrentTxnId());
use of org.apache.hadoop.hive.ql.exec.TableScanOperator in project hive by apache.
the class SemanticAnalyzer method genTablePlan.
// Produces the operator that reads table `alias` for query block `qb`: a TableScanOperator
// (created on first use, otherwise looked up in topOps), optionally followed by a sampling
// filter operator. The result is registered through putOpInsertMap before being returned.
//
// NOTE(review): closing braces and a few statements are missing from this excerpt (see the
// individual notes below); confirm against the full method before relying on details.
//
// @param alias table alias within the query block
// @param qb    query block that owns the alias
// @return the value returned by putOpInsertMap for the final operator and its row resolver
// @throws SemanticException for sampling on a table with neither bucket nor sample columns,
//         or when the sample numerator exceeds the denominator
private Operator genTablePlan(String alias, QB qb) throws SemanticException {
String alias_id = getAliasId(alias, qb);
Table tab = qb.getMetaData().getSrcForAlias(alias);
RowResolver rwsch;
// is the table already present
TableScanOperator top = topOps.get(alias_id);
// Obtain table props in query
Map<String, String> properties = qb.getTabPropsForAlias(alias);
// First time this alias is seen: build its row resolver and a new table scan.
if (top == null) {
// Determine row schema for TSOP.
// Include column names from SerDe, the partition and virtual columns.
rwsch = new RowResolver();
try {
// Including parameters passed in the query
if (properties != null) {
for (Entry<String, String> prop : properties.entrySet()) {
// Only warn when the query-level property replaces a value already stored on the table.
if (tab.getSerdeParam(prop.getKey()) != null) {
LOG.warn("SerDe property in input query overrides stored SerDe property");
tab.setSerdeParam(prop.getKey(), prop.getValue());
// Obtain inspector for schema
StructObjectInspector rowObjectInspector = (StructObjectInspector) tab.getDeserializer().getObjectInspector();
List<? extends StructField> fields = rowObjectInspector.getAllStructFieldRefs();
// One ColumnInfo per struct field reported by the deserializer's object inspector.
for (int i = 0; i < fields.size(); i++) {
* if the column is a skewed column, use ColumnInfo accordingly
ColumnInfo colInfo = new ColumnInfo(fields.get(i).getFieldName(), TypeInfoUtils.getTypeInfoFromObjectInspector(fields.get(i).getFieldObjectInspector()), alias, false);
colInfo.setSkewedCol((isSkewedCol(alias, qb, fields.get(i).getFieldName())) ? true : false);
rwsch.put(alias, fields.get(i).getFieldName(), colInfo);
// A SerDe failure is rethrown unchecked, with the original exception as the cause.
} catch (SerDeException e) {
throw new RuntimeException(e);
// Finally add the partitioning columns
for (FieldSchema part_col : tab.getPartCols()) {
LOG.trace("Adding partition col: " + part_col);
rwsch.put(alias, part_col.getName(), new ColumnInfo(part_col.getName(), TypeInfoFactory.getPrimitiveTypeInfo(part_col.getType()), alias, true));
// put all virtual columns in RowResolver.
Iterator<VirtualColumn> vcs = VirtualColumn.getRegistry(conf).iterator();
// use a list for easy customization
List<VirtualColumn> vcList = new ArrayList<VirtualColumn>();
while (vcs.hasNext()) {
// NOTE(review): the right-hand side of the next assignment is missing in this excerpt
// (presumably vcs.next()), and no statement adding vc to vcList is visible — confirm.
VirtualColumn vc =;
rwsch.put(alias, vc.getName().toLowerCase(), new ColumnInfo(vc.getName(), vc.getTypeInfo(), alias, true, vc.getIsHidden()));
// Create the root of the operator tree
TableScanDesc tsDesc = new TableScanDesc(alias, vcList, tab);
setupStats(tsDesc, qb.getParseInfo(), tab, alias, rwsch);
SplitSample sample = nameToSplitSample.get(alias_id);
// NOTE(review): the body of this `if` is not visible in this excerpt.
if (sample != null && sample.getRowCount() != null) {
top = (TableScanOperator) putOpInsertMap(OperatorFactory.get(getOpContext(), tsDesc, new RowSchema(rwsch.getColumnInfos())), rwsch);
// Set insiderView so that we can skip the column authorization for this.
top.setInsideView(qb.isInsideView() || qb.getAliasInsideView().contains(alias.toLowerCase()));
// Add this to the list of top operators - we always start from a table
// scan
topOps.put(alias_id, top);
// Add a mapping from the table scan operator to Table
topToTable.put(top, tab);
if (properties != null) {
topToTableProps.put(top, properties);
// Alias already has a table scan: reuse the row resolver recorded for it in opParseCtx.
} else {
rwsch = opParseCtx.get(top).getRowResolver();
// check if this table is sampled and needs more than input pruning
Operator<? extends OperatorDesc> op = top;
TableSample ts = qb.getParseInfo().getTabSample(alias);
// The query block carries an explicit sample specification for this alias.
if (ts != null) {
TableScanOperator tableScanOp = top;
int num = ts.getNumerator();
int den = ts.getDenominator();
ArrayList<ASTNode> sampleExprs = ts.getExprs();
// TODO: Do the type checking of the expressions
List<String> tabBucketCols = tab.getBucketCols();
int numBuckets = tab.getNumBuckets();
// If there are no sample cols and no bucket cols then throw an error
if (tabBucketCols.size() == 0 && sampleExprs.size() == 0) {
throw new SemanticException(ErrorMsg.NON_BUCKETED_TABLE.getMsg() + " " + tab.getTableName());
if (num > den) {
throw new SemanticException(ErrorMsg.BUCKETED_NUMERATOR_BIGGER_DENOMINATOR.getMsg() + " " + tab.getTableName());
// check if a predicate is needed
// predicate is needed if either input pruning is not enough
// or if input pruning is not possible
// check if the sample columns are the same as the table bucket columns
boolean colsEqual = true;
// A differing (non-zero) number of sample expressions rules out equality immediately.
if ((sampleExprs.size() != tabBucketCols.size()) && (sampleExprs.size() != 0)) {
colsEqual = false;
// Every sample expression must match some bucket column, compared case-insensitively.
for (int i = 0; i < sampleExprs.size() && colsEqual; i++) {
boolean colFound = false;
for (int j = 0; j < tabBucketCols.size() && !colFound; j++) {
// NOTE(review): the body of this token-type check is not visible in this excerpt.
if (sampleExprs.get(i).getToken().getType() != HiveParser.TOK_TABLE_OR_COL) {
if (((ASTNode) sampleExprs.get(i).getChild(0)).getText().equalsIgnoreCase(tabBucketCols.get(j))) {
colFound = true;
colsEqual = (colsEqual && colFound);
// Check if input can be pruned
// NOTE(review): sampleExprs has already been dereferenced above, so the null checks on the
// next two statements can never be the first thing to observe a null value.
ts.setInputPruning((sampleExprs == null || sampleExprs.size() == 0 || colsEqual));
// check if input pruning is enough
if ((sampleExprs == null || sampleExprs.size() == 0 || colsEqual) && (num == den || (den % numBuckets == 0 || numBuckets % den == 0))) {
// input pruning is enough; add the filter for the optimizer to use it
// later"No need for sample filter");
// This filter carries a SampleDesc; the filter built in the else branch below does not.
ExprNodeDesc samplePredicate = genSamplePredicate(ts, tabBucketCols, colsEqual, alias, rwsch, qb.getMetaData(), null);
FilterDesc filterDesc = new FilterDesc(samplePredicate, true, new SampleDesc(ts.getNumerator(), ts.getDenominator(), tabBucketCols, true));
op = OperatorFactory.getAndMakeChild(filterDesc, new RowSchema(rwsch.getColumnInfos()), top);
} else {
// need to add filter
// create tableOp to be filterDesc and set as child to 'top'"Need sample filter");
ExprNodeDesc samplePredicate = genSamplePredicate(ts, tabBucketCols, colsEqual, alias, rwsch, qb.getMetaData(), null);
FilterDesc filterDesc = new FilterDesc(samplePredicate, true);
op = OperatorFactory.getAndMakeChild(filterDesc, new RowSchema(rwsch.getColumnInfos()), top);
// No explicit sample: in test mode a sample is synthesized unless the table is exempted.
} else {
boolean testMode = conf.getBoolVar(HiveConf.ConfVars.HIVETESTMODE);
if (testMode) {
String tabName = tab.getTableName();
// has the user explicitly asked not to sample this table
String unSampleTblList = conf.getVar(HiveConf.ConfVars.HIVETESTMODENOSAMPLE);
String[] unSampleTbls = unSampleTblList.split(",");
boolean unsample = false;
for (String unSampleTbl : unSampleTbls) {
if (tabName.equalsIgnoreCase(unSampleTbl)) {
unsample = true;
if (!unsample) {
int numBuckets = tab.getNumBuckets();
// If the input table is bucketed, choose the first bucket
if (numBuckets > 0) {
TableSample tsSample = new TableSample(1, numBuckets);
qb.getParseInfo().setTabSample(alias, tsSample);
ExprNodeDesc samplePred = genSamplePredicate(tsSample, tab.getBucketCols(), true, alias, rwsch, qb.getMetaData(), null);
FilterDesc filterDesc = new FilterDesc(samplePred, true, new SampleDesc(tsSample.getNumerator(), tsSample.getDenominator(), tab.getBucketCols(), true));
op = OperatorFactory.getAndMakeChild(filterDesc, new RowSchema(rwsch.getColumnInfos()), top);"No need for sample filter");
} else {
// The table is not bucketed, add a dummy filter :: rand()
int freq = conf.getIntVar(HiveConf.ConfVars.HIVETESTMODESAMPLEFREQ);
TableSample tsSample = new TableSample(1, freq);
qb.getParseInfo().setTabSample(alias, tsSample);"Need sample filter");
// rand is given a fixed constant argument (presumably a seed, for repeatable test runs — confirm).
ExprNodeDesc randFunc = TypeCheckProcFactory.DefaultExprProcessor.getFuncExprNodeDesc("rand", new ExprNodeConstantDesc(Integer.valueOf(460476415)));
ExprNodeDesc samplePred = genSamplePredicate(tsSample, null, false, alias, rwsch, qb.getMetaData(), randFunc);
FilterDesc filterDesc = new FilterDesc(samplePred, true);
op = OperatorFactory.getAndMakeChild(filterDesc, new RowSchema(rwsch.getColumnInfos()), top);
// Register whichever operator ended up on top (the scan itself or a sampling filter).
Operator output = putOpInsertMap(op, rwsch);
if (LOG.isDebugEnabled()) {
LOG.debug("Created Table Plan for " + alias + " " + op.toString());
return output;
use of org.apache.hadoop.hive.ql.exec.TableScanOperator in project hive by apache.
the class SemanticAnalyzer method genPlan.
// Builds the operator plan for one query block: plans every subquery and source table in the
// FROM clause, then PTF invocations, lateral views and joins, and finally the body of the
// block via genBodyPlan.
//
// NOTE(review): closing braces and some statements are missing from this excerpt; the
// individual notes below mark the places where that is visible.
//
// @param qb                 query block to plan
// @param skipAmbiguityCheck forwarded unchanged to rewriteRRForSubQ when the block has an alias
// @return the operator returned by genBodyPlan for this block
// @throws SemanticException if a view alias does not map to a SelectOperator, or a PTF
//         invocation names an input alias that has no operator
public Operator genPlan(QB qb, boolean skipAmbiguityCheck) throws SemanticException {
// First generate all the opInfos for the elements in the from clause
// Must be deterministic order map - see HIVE-8707
Map<String, Operator> aliasToOpInfo = new LinkedHashMap<String, Operator>();
// Recurse over the subqueries to fill the subquery part of the plan
for (String alias : qb.getSubqAliases()) {
QBExpr qbexpr = qb.getSubqForAlias(alias);
Operator<?> operator = genPlan(qb, qbexpr);
aliasToOpInfo.put(alias, operator);
// Aliases that stand for a view additionally record the view's projection operator.
if (qb.getViewToTabSchema().containsKey(alias)) {
// we set viewProjectToTableSchema so that we can leverage ColumnPruner.
if (operator instanceof LimitOperator) {
// If create view has LIMIT operator, this can happen
// Fetch parent operator
operator = operator.getParentOperators().get(0);
if (operator instanceof SelectOperator) {
// The map is created on first use.
if (this.viewProjectToTableSchema == null) {
this.viewProjectToTableSchema = new LinkedHashMap<>();
viewProjectToTableSchema.put((SelectOperator) operator, qb.getViewToTabSchema().get(alias));
} else {
throw new SemanticException("View " + alias + " is corresponding to " + operator.getType().name() + ", rather than a SelectOperator.");
// Recurse over all the source tables
for (String alias : qb.getTabAliases()) {
// NOTE(review): the body of this DUMMY_TABLE check is not visible in this excerpt.
if (alias.equals(DUMMY_TABLE)) {
Operator op = genTablePlan(alias, qb);
aliasToOpInfo.put(alias, op);
// Nothing in the FROM clause produced an operator: plan against the dummy table instead.
if (aliasToOpInfo.isEmpty()) {
qb.getMetaData().setSrcForAlias(DUMMY_TABLE, getDummyTable());
TableScanOperator op = (TableScanOperator) genTablePlan(DUMMY_TABLE, qb);
aliasToOpInfo.put(DUMMY_TABLE, op);
Operator srcOpInfo = null;
Operator lastPTFOp = null;
if (queryProperties.hasPTF()) {
// After processing subqueries and source tables, process
// partitioned table functions
HashMap<ASTNode, PTFInvocationSpec> ptfNodeToSpec = qb.getPTFNodeToSpec();
if (ptfNodeToSpec != null) {
for (Entry<ASTNode, PTFInvocationSpec> entry : ptfNodeToSpec.entrySet()) {
ASTNode ast = entry.getKey();
PTFInvocationSpec spec = entry.getValue();
String inputAlias = spec.getQueryInputName();
Operator inOp = aliasToOpInfo.get(inputAlias);
if (inOp == null) {
throw new SemanticException(generateErrorMessage(ast, "Cannot resolve input Operator for PTF invocation"));
lastPTFOp = genPTFPlan(spec, inOp);
String ptfAlias = spec.getFunction().getAlias();
// A PTF with an alias becomes addressable by later stages through aliasToOpInfo.
if (ptfAlias != null) {
aliasToOpInfo.put(ptfAlias, lastPTFOp);
// For all the source tables that have a lateral view, attach the
// appropriate operators to the TS
genLateralViewPlans(aliasToOpInfo, qb);
// process join
if (qb.getParseInfo().getJoinExpr() != null) {
ASTNode joinExpr = qb.getParseInfo().getJoinExpr();
// NOTE(review): in both branches the excerpt shows joinTree being built but not what is
// done with it afterwards; later code reads qb.getQbJoinTree() — confirm where it is set.
if (joinExpr.getToken().getType() == HiveParser.TOK_UNIQUEJOIN) {
QBJoinTree joinTree = genUniqueJoinTree(qb, joinExpr, aliasToOpInfo);
} else {
QBJoinTree joinTree = genJoinTree(qb, joinExpr, aliasToOpInfo);
* if there is only one destination in Query try to push where predicates
* as Join conditions
Set<String> dests = qb.getParseInfo().getClauseNames();
if (dests.size() == 1 && joinTree.getNoOuterJoin()) {
String dest = dests.iterator().next();
ASTNode whereClause = qb.getParseInfo().getWhrForClause(dest);
if (whereClause != null) {
extractJoinCondsFromWhereClause(joinTree, qb, dest, (ASTNode) whereClause.getChild(0), aliasToOpInfo);
if (!disableJoinMerge) {
// if any filters are present in the join tree, push them on top of the
// table
pushJoinFilters(qb, qb.getQbJoinTree(), aliasToOpInfo);
srcOpInfo = genJoinPlan(qb, aliasToOpInfo);
} else {
// Now if there are more than 1 sources then we have a join case
// later we can extend this to the union all case as well
// No join expression: the first registered operator is taken as the source.
srcOpInfo = aliasToOpInfo.values().iterator().next();
// with ptfs, there maybe more (note for PTFChains:
// 1 ptf invocation may entail multiple PTF operators)
srcOpInfo = lastPTFOp != null ? lastPTFOp : srcOpInfo;
Operator bodyOpInfo = genBodyPlan(qb, srcOpInfo, aliasToOpInfo);
if (LOG.isDebugEnabled()) {
LOG.debug("Created Plan for Query Block " + qb.getId());
// Only query blocks that have an alias get their row resolver rewritten.
if (qb.getAlias() != null) {
rewriteRRForSubQ(qb.getAlias(), bodyOpInfo, skipAmbiguityCheck);
return bodyOpInfo;
use of org.apache.hadoop.hive.ql.exec.TableScanOperator in project hive by apache.
the class SetSparkReducerParallelism method process.
// Node-processor callback that works out a reducer count for one ReduceSinkOperator.
// The visible code derives the count from a user-set constant, a bucket count, data-size
// statistics, or the parent reduce sinks' reducer counts.
//
// NOTE(review): closing braces, the log-call prefixes and the statements that store the
// chosen value on the descriptor are missing from this excerpt — confirm against the full
// method. `useOpStats` and `sparkMemoryAndCores` are defined outside this method.
//
// @param nd          node being visited; cast to ReduceSinkOperator
// @param stack       traversal stack (not read in the visible code)
// @param procContext cast to OptimizeSparkProcContext; supplies the conf and visited sinks
// @param nodeOutputs outputs of previously processed nodes (not read in the visible code)
// @return Boolean true when the sink was already processed, false on the other visible paths
// @throws SemanticException declared by the signature
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procContext, Object... nodeOutputs) throws SemanticException {
OptimizeSparkProcContext context = (OptimizeSparkProcContext) procContext;
ReduceSinkOperator sink = (ReduceSinkOperator) nd;
ReduceSinkDesc desc = sink.getConf();
Set<ReduceSinkOperator> parentSinks = null;
int maxReducers = context.getConf().getIntVar(HiveConf.ConfVars.MAXREDUCERS);
int constantReducers = context.getConf().getIntVar(HiveConf.ConfVars.HADOOPNUMREDUCERS);
// Without operator statistics, parallelism is inherited from upstream reduce sinks, so all
// of them must have been visited first.
if (!useOpStats) {
parentSinks = OperatorUtils.findOperatorsUpstream(sink, ReduceSinkOperator.class);
if (!context.getVisitedReduceSinks().containsAll(parentSinks)) {
// We haven't processed all the parent sinks, and we need
// them to be done in order to compute the parallelism for this sink.
// In this case, skip. We should visit this again from another path.
LOG.debug("Skipping sink " + sink + " for now as we haven't seen all its parents.");
return false;
if (context.getVisitedReduceSinks().contains(sink)) {
// skip walking the children
LOG.debug("Already processed reduce sink: " + sink.getName());
return true;
if (needSetParallelism(sink, context.getConf())) {
// A positive HADOOPNUMREDUCERS value is treated as a user-supplied reducer count.
if (constantReducers > 0) {"Parallelism for reduce sink " + sink + " set by user to " + constantReducers);
} else {
// If it's a FileSink to bucketed files, use the bucket count as the reducer number
FileSinkOperator fso = GenSparkUtils.getChildOperator(sink, FileSinkOperator.class);
if (fso != null) {
String bucketCount = fso.getConf().getTableInfo().getProperties().getProperty(hive_metastoreConstants.BUCKET_COUNT);
// A missing bucket-count property is treated as zero buckets.
int numBuckets = bucketCount == null ? 0 : Integer.parseInt(bucketCount);
if (numBuckets > 0) {"Set parallelism for reduce sink " + sink + " to: " + numBuckets + " (buckets)");
return false;
// Estimate from data size when operator stats are enabled or there is no parent reduce sink.
if (useOpStats || parentSinks.isEmpty()) {
long numberOfBytes = 0;
if (useOpStats) {
// we need to add up all the estimates from the siblings of this reduce sink
// "Siblings" are all parents of this sink's first child operator.
for (Operator<? extends OperatorDesc> sibling : sink.getChildOperators().get(0).getParentOperators()) {
if (sibling.getStatistics() != null) {
numberOfBytes = StatsUtils.safeAdd(numberOfBytes, sibling.getStatistics().getDataSize());
if (LOG.isDebugEnabled()) {
LOG.debug("Sibling " + sibling + " has stats: " + sibling.getStatistics());
} else {
LOG.warn("No stats available from: " + sibling);
} else {
// we should use TS stats to infer parallelism
for (Operator<? extends OperatorDesc> sibling : sink.getChildOperators().get(0).getParentOperators()) {
Set<TableScanOperator> sources = OperatorUtils.findOperatorsUpstream(sibling, TableScanOperator.class);
for (TableScanOperator source : sources) {
if (source.getStatistics() != null) {
numberOfBytes = StatsUtils.safeAdd(numberOfBytes, source.getStatistics().getDataSize());
if (LOG.isDebugEnabled()) {
LOG.debug("Table source " + source + " has stats: " + source.getStatistics());
} else {
LOG.warn("No stats available from table source: " + source);
LOG.debug("Gathered stats for sink " + sink + ". Total size is " + numberOfBytes + " bytes.");
// Divide it by 2 so that we can have more reducers
long bytesPerReducer = context.getConf().getLongVar(HiveConf.ConfVars.BYTESPERREDUCER) / 2;
int numReducers = Utilities.estimateReducers(numberOfBytes, bytesPerReducer, maxReducers, false);
// Only consult the Spark memory/cores pair when both of its values are positive.
if (sparkMemoryAndCores != null && sparkMemoryAndCores.getFirst() > 0 && sparkMemoryAndCores.getSecond() > 0) {
// warn the user if bytes per reducer is much larger than memory per task
if ((double) sparkMemoryAndCores.getFirst() / bytesPerReducer < 0.5) {
LOG.warn("Average load of a reducer is much larger than its available memory. " + "Consider decreasing hive.exec.reducers.bytes.per.reducer");
// If there are more cores, use the number of cores
numReducers = Math.max(numReducers, sparkMemoryAndCores.getSecond());
// The final estimate is capped at maxReducers.
numReducers = Math.min(numReducers, maxReducers);"Set parallelism for reduce sink " + sink + " to: " + numReducers + " (calculated)");
} else {
// Use the maximum parallelism from all parent reduce sinks
int numberOfReducers = 0;
for (ReduceSinkOperator parent : parentSinks) {
numberOfReducers = Math.max(numberOfReducers, parent.getConf().getNumReducers());
LOG.debug("Set parallelism for sink " + sink + " to " + numberOfReducers + " based on its parents");
// Key and partition columns are wrapped so they can be compared with equals().
final Collection<ExprNodeDesc.ExprNodeDescEqualityWrapper> keyCols = ExprNodeDesc.ExprNodeDescEqualityWrapper.transform(desc.getKeyCols());
final Collection<ExprNodeDesc.ExprNodeDescEqualityWrapper> partCols = ExprNodeDesc.ExprNodeDescEqualityWrapper.transform(desc.getPartitionCols());
// NOTE(review): the body of this `if` is not visible in this excerpt.
if (keyCols != null && keyCols.equals(partCols)) {
} else {"Number of reducers for sink " + sink + " was already determined to be: " + desc.getNumReducers());
return false;