Example use of org.apache.hadoop.hive.ql.exec.TableScanOperator in the Apache Hive project.
From class DriverTxnHandler, method setValidWriteIds.
/**
 * Stores the string form of the valid write-id list in the driver configuration and, when the plan
 * has a fetch task whose source is a table scan, looks up the write-id list for the scanned table.
 *
 * NOTE(review): this excerpt is truncated; closing braces and everything after the final
 * {@code if (writeIdList != null)} are not visible here.
 *
 * @param txnWriteIds valid write ids; {@code toString()} of this object is written to the conf
 * @throws IllegalStateException if the scanned table is transactional but has no entry in
 *         {@code txnWriteIds}
 */
private void setValidWriteIds(ValidTxnWriteIdList txnWriteIds) {
driverContext.getConf().set(ValidTxnWriteIdList.VALID_TABLES_WRITEIDS_KEY, txnWriteIds.toString());
if (driverContext.getPlan().getFetchTask() != null) {
// This is needed for {@link HiveConf.ConfVars.HIVEFETCHTASKCONVERSION} optimization which initializes JobConf
// in FetchOperator before recordValidTxns() but this has to be done after locks are acquired to avoid race
// conditions in ACID. This case is supported only for single source query.
Operator<?> source = driverContext.getPlan().getFetchTask().getWork().getSource();
if (source instanceof TableScanOperator) {
TableScanOperator tsOp = (TableScanOperator) source;
// Look up the write-id list by the scan's fully qualified table name.
String fullTableName = AcidUtils.getFullTableName(tsOp.getConf().getDatabaseName(), tsOp.getConf().getTableName());
ValidWriteIdList writeIdList = txnWriteIds.getTableValidWriteIdList(fullTableName);
// A transactional table without a write-id list entry is treated as an illegal state.
if (tsOp.getConf().isTranscationalTable() && (writeIdList == null)) {
throw new IllegalStateException(String.format("ACID table: %s is missing from the ValidWriteIdList config: %s", fullTableName, txnWriteIds.toString()));
// NOTE(review): the closing brace of the check above and the body of the following 'if' are
// missing from this excerpt.
if (writeIdList != null) {
Example use of org.apache.hadoop.hive.ql.exec.TableScanOperator in the Apache Hive project.
From class SparkPlanGenerator, method generateMapInput.
private MapInput generateMapInput(SparkPlan sparkPlan, MapWork mapWork) throws Exception {
JobConf jobConf = cloneJobConf(mapWork);
Class ifClass = getInputFormat(jobConf, mapWork);, ""));
JavaPairRDD<WritableComparable, Writable> hadoopRDD;
if (mapWork.getNumMapTasks() != null) {
hadoopRDD = sc.hadoopRDD(jobConf, ifClass, WritableComparable.class, Writable.class, mapWork.getNumMapTasks());
} else {
hadoopRDD = sc.hadoopRDD(jobConf, ifClass, WritableComparable.class, Writable.class);
boolean toCache = false;
String tables = mapWork.getAllRootOperators().stream().filter(op -> op instanceof TableScanOperator).map(ts -> ((TableScanDesc) ts.getConf()).getAlias()).collect(Collectors.joining(", "));
String rddName = mapWork.getName() + " (" + tables + ", " + hadoopRDD.getNumPartitions() + (toCache ? ", cached)" : ")");
// Caching is disabled for MapInput due to HIVE-8920
MapInput result = new MapInput(sparkPlan, hadoopRDD, toCache, rddName, mapWork);
return result;
Example use of org.apache.hadoop.hive.ql.exec.TableScanOperator in the Apache Hive project.
From class ProjectionPusher, method pushProjectionsAndFilters.
/**
 * Gathers needed column ids, nested column paths and filter expressions from the table scans
 * registered for a split path, then pushes the projections and an OR-ed filter into the job conf.
 *
 * NOTE(review): this excerpt has lines elided (unbalanced braces, missing branch bodies); the
 * comments below describe only what is visible.
 *
 * @param jobConf job configuration that receives the read columns, nested paths and filters
 * @param splitPath path used to look up the aliases in the map work's path-to-aliases mapping
 * @param splitPathWithNoSchema not referenced in the visible part of this excerpt
 */
private void pushProjectionsAndFilters(final JobConf jobConf, final String splitPath, final String splitPathWithNoSchema) {
// NOTE(review): the bodies of the next two branches are not visible in this excerpt.
if (mapWork == null) {
} else if (mapWork.getPathToAliases() == null) {
final Set<String> aliases = new HashSet<String>();
try {
List<String> a = HiveFileFormatUtils.getFromPathRecursively(mapWork.getPathToAliases(), new Path(splitPath), null, false, true);
// NOTE(review): the statement guarded by (a != null) appears to be elided; presumably it
// copies the found aliases into 'aliases'. Confirm against the full source.
if (a != null) {
if (a == null || a.isEmpty()) {
// TODO: not having aliases for path usually means some bug. Should it give up?
LOG.warn("Couldn't find aliases for " + splitPath);
} catch (IllegalArgumentException | IOException e) {
// Lookup failures are rethrown unchecked with the original exception as the cause.
throw new RuntimeException(e);
// Collect the needed columns from all the aliases and create ORed filter
// expression for the table.
boolean allColumnsNeeded = false;
boolean noFilters = false;
Set<Integer> neededColumnIDs = new HashSet<Integer>();
// To support nested column pruning, we need to track the path from the top to the nested
// fields
Set<String> neededNestedColumnPaths = new HashSet<String>();
List<ExprNodeGenericFuncDesc> filterExprs = new ArrayList<ExprNodeGenericFuncDesc>();
RowSchema rowSchema = null;
for (String alias : aliases) {
final Operator<? extends Serializable> op = mapWork.getAliasToWork().get(alias);
if (op != null && op instanceof TableScanOperator) {
final TableScanOperator ts = (TableScanOperator) op;
// A scan without a needed-column list means every column must be read.
if (ts.getNeededColumnIDs() == null) {
allColumnsNeeded = true;
} else {
// NOTE(review): the statements that add to neededColumnIDs / neededNestedColumnPaths are not
// visible in this excerpt.
if (ts.getNeededNestedColumnPaths() != null) {
rowSchema = ts.getSchema();
ExprNodeGenericFuncDesc filterExpr = ts.getConf() == null ? null : ts.getConf().getFilterExpr();
// No filter if any TS has no filter expression
noFilters = filterExpr == null;
ExprNodeGenericFuncDesc tableFilterExpr = null;
if (!noFilters) {
try {
// Fold all collected filter expressions into a single OR expression.
for (ExprNodeGenericFuncDesc filterExpr : filterExprs) {
if (tableFilterExpr == null) {
tableFilterExpr = filterExpr;
} else {
tableFilterExpr = ExprNodeGenericFuncDesc.newInstance(new GenericUDFOPOr(), Arrays.<ExprNodeDesc>asList(tableFilterExpr, filterExpr));
} catch (UDFArgumentException ex) {
// If the OR expression cannot be built, continue without a pushed-down filter.
LOG.debug("Turn off filtering due to " + ex);
tableFilterExpr = null;
// push down projections
if (!allColumnsNeeded) {
if (!neededColumnIDs.isEmpty()) {
ColumnProjectionUtils.appendReadColumns(jobConf, new ArrayList<Integer>(neededColumnIDs));
ColumnProjectionUtils.appendNestedColumnPaths(jobConf, new ArrayList<String>(neededNestedColumnPaths));
} else {
// NOTE(review): the body of this 'else' and the comment preceding the filter push are not
// visible; the call below hands the row schema and combined filter to pushFilters.
pushFilters(jobConf, rowSchema, tableFilterExpr);
Example use of org.apache.hadoop.hive.ql.exec.TableScanOperator in the Apache Hive project.
From class ConvertJoinMapJoin, method removeCycleCreatingSemiJoinOps.
// Remove any semijoin branch associated with hashjoin's parent's operator
// pipeline which can cause a cycle after hashjoin optimization.
//
// Scans the children of parentSelectOpOfBigTable for semijoin branches, records those whose
// target TableScan is also found upstream of the map join's parents, and removes the recorded
// branches through GenTezUtils.removeSemiJoinOperator.
//
// NOTE(review): this excerpt has lines elided; the statements that follow several guard
// conditions (presumably 'continue'/'break') and most closing braces are not visible.
private void removeCycleCreatingSemiJoinOps(MapJoinOperator mapjoinOp, Operator<?> parentSelectOpOfBigTable, ParseContext parseContext) throws SemanticException {
// Semijoin ReduceSink -> target TableScan for every branch that would create a cycle.
Map<ReduceSinkOperator, TableScanOperator> semiJoinMap = new HashMap<ReduceSinkOperator, TableScanOperator>();
for (Operator<?> op : parentSelectOpOfBigTable.getChildOperators()) {
if (!(op instanceof SelectOperator)) {
// Follow the first-child chain down to the last operator of this branch.
while (op.getChildOperators().size() > 0) {
op = op.getChildOperators().get(0);
// If not ReduceSink Op, skip
if (!(op instanceof ReduceSinkOperator)) {
ReduceSinkOperator rs = (ReduceSinkOperator) op;
// NOTE(review): get(rs) is dereferenced directly; this assumes the map has an entry for rs.
// Confirm whether a null check was elided here.
TableScanOperator ts = parseContext.getRsToSemiJoinBranchInfo().get(rs).getTsOp();
if (ts == null) {
// skip, no semijoin branch
// Found a semijoin branch.
// There can be more than one semijoin branch coming from the parent
// GBY Operator of the RS Operator.
Operator<?> parentGB = op.getParentOperators().get(0);
for (Operator<?> childRS : parentGB.getChildOperators()) {
// Get the RS and TS for this branch
rs = (ReduceSinkOperator) childRS;
ts = parseContext.getRsToSemiJoinBranchInfo().get(rs).getTsOp();
assert ts != null;
// Compare this branch's target TableScan with the TableScans upstream of the map join's parents.
for (Operator<?> parent : mapjoinOp.getParentOperators()) {
if (!(parent instanceof ReduceSinkOperator)) {
Set<TableScanOperator> tsOps = OperatorUtils.findOperatorsUpstream(parent, TableScanOperator.class);
boolean found = false;
for (TableScanOperator parentTS : tsOps) {
// If the parent is same as the ts, then we have a cycle.
if (ts == parentTS) {
semiJoinMap.put(rs, ts);
found = true;
if (found) {
// Remove every semijoin branch recorded above.
if (semiJoinMap.size() > 0) {
for (ReduceSinkOperator rs : semiJoinMap.keySet()) {
if (LOG.isDebugEnabled()) {
LOG.debug("Found semijoin optimization from the big table side of a map join, which will cause a task cycle. " + "Removing semijoin " + OperatorUtils.getOpNamePretty(rs) + " - " + OperatorUtils.getOpNamePretty(semiJoinMap.get(rs)));
GenTezUtils.removeSemiJoinOperator(parseContext, rs, semiJoinMap.get(rs));
Example use of org.apache.hadoop.hive.ql.exec.TableScanOperator in the Apache Hive project.
From class DynamicPartitionPruningOptimization, method generateSemiJoinOperatorPlan.
// Generates plan for min/max when dynamic partition pruning is ruled out.
//
// Builds a select -> hash group-by -> reduce sink -> final group-by chain under parentOfRS and
// passes the final group-by to createFinalRsForSemiJoinOp. When a group-by is already registered
// for the key expression, that one is reused instead.
// Returns false when the optimization is skipped (partition column, or no ColumnInfo found) and
// true when a semijoin branch was created or reused.
//
// NOTE(review): this excerpt has lines elided; closing braces and several statements (list
// additions, 'if' bodies, the end of the value-column loop) are not visible.
private boolean generateSemiJoinOperatorPlan(DynamicListContext ctx, ParseContext parseContext, TableScanOperator ts, String keyBaseAlias, String internalColName, String colName, SemiJoinHint sjHint) throws SemanticException {
// we will put a fork in the plan at the source of the reduce sink
Operator<? extends OperatorDesc> parentOfRS = ctx.generator.getParentOperators().get(0);
// we need the expr that generated the key of the reduce sink
ExprNodeDesc key = ctx.getKeyCol();
assert colName != null;
// Fetch the TableScan Operator.
// Walks up the first-parent chain until a TableScan, a ReduceSink or null is reached.
Operator<?> op = parentOfRS;
while (!(op == null || op instanceof TableScanOperator || op instanceof ReduceSinkOperator)) {
op = op.getParentOperators().get(0);
if (op instanceof TableScanOperator) {
Table table = ((TableScanOperator) op).getConf().getTableMetadata();
if (table.isPartitionKey(colName)) {
// The column is partition column, skip the optimization.
return false;
// Check if there already exists a semijoin branch
GroupByOperator gb = parseContext.getColExprToGBMap().get(key);
if (gb != null) {
// Already an existing semijoin branch, reuse it
createFinalRsForSemiJoinOp(parseContext, ts, gb, key, keyBaseAlias, ctx.parent.getChildren().get(0), sjHint != null);
// done!
return true;
// NOTE(review): no additions to keyExprs or outputNames are visible in this excerpt, yet
// outputNames.get(0) is read further down; presumably the add calls were elided.
List<ExprNodeDesc> keyExprs = new ArrayList<ExprNodeDesc>();
// group by requires "ArrayList", don't ask.
ArrayList<String> outputNames = new ArrayList<String>();
// project the relevant key column
SelectDesc select = new SelectDesc(keyExprs, outputNames);
// Create the new RowSchema for the projected column
ColumnInfo columnInfo = parentOfRS.getSchema().getColumnInfo(internalColName);
columnInfo = new ColumnInfo(columnInfo);
ArrayList<ColumnInfo> signature = new ArrayList<ColumnInfo>();
RowSchema rowSchema = new RowSchema(signature);
// Create the column expr map
Map<String, ExprNodeDesc> colExprMap = new HashMap<String, ExprNodeDesc>();
ExprNodeDesc exprNode = null;
// NOTE(review): columnInfo was reassigned from 'new ColumnInfo(columnInfo)' above, so as written
// this null check cannot be true; a null lookup result would reach the copy constructor first.
// Consider checking the lookup result before copying.
if (columnInfo == null) {
LOG.debug("No ColumnInfo found in {} for {}", parentOfRS.getOperatorId(), internalColName);
return false;
exprNode = new ExprNodeColumnDesc(columnInfo);
colExprMap.put(internalColName, exprNode);
// Create the Select Operator
SelectOperator selectOp = (SelectOperator) OperatorFactory.getAndMakeChild(select, rowSchema, colExprMap, parentOfRS);
// do a group by to aggregate min,max and bloom filter.
float groupByMemoryUsage = HiveConf.getFloatVar(parseContext.getConf(), HiveConf.ConfVars.HIVEMAPAGGRHASHMEMORY);
float memoryThreshold = HiveConf.getFloatVar(parseContext.getConf(), HiveConf.ConfVars.HIVEMAPAGGRMEMORYTHRESHOLD);
float minReductionHashAggr = HiveConf.getFloatVar(parseContext.getConf(), ConfVars.HIVEMAPAGGRHASHMINREDUCTION);
float minReductionHashAggrLowerBound = HiveConf.getFloatVar(parseContext.getConf(), ConfVars.HIVEMAPAGGRHASHMINREDUCTIONLOWERBOUND);
// Add min/max and bloom filter aggregations
List<ObjectInspector> aggFnOIs = new ArrayList<ObjectInspector>();
ArrayList<ExprNodeDesc> params = new ArrayList<ExprNodeDesc>();
params.add(new ExprNodeColumnDesc(key.getTypeInfo(), outputNames.get(0), "", false));
// NOTE(review): no additions to aggs (or aggFnOIs) are visible in this excerpt; presumably the
// three descriptors created below are added in elided lines.
ArrayList<AggregationDesc> aggs = new ArrayList<AggregationDesc>();
try {
AggregationDesc min = new AggregationDesc("min", FunctionRegistry.getGenericUDAFEvaluator("min", aggFnOIs, false, false), params, false, Mode.PARTIAL1);
AggregationDesc max = new AggregationDesc("max", FunctionRegistry.getGenericUDAFEvaluator("max", aggFnOIs, false, false), params, false, Mode.PARTIAL1);
AggregationDesc bloomFilter = new AggregationDesc("bloom_filter", FunctionRegistry.getGenericUDAFEvaluator("bloom_filter", aggFnOIs, false, false), params, false, Mode.PARTIAL1);
GenericUDAFBloomFilterEvaluator bloomFilterEval = (GenericUDAFBloomFilterEvaluator) bloomFilter.getGenericUDAFEvaluator();
// NOTE(review): only the debug log is visible inside this hint branch; any use of
// bloomFilterEval with the hinted entry count is not shown here.
if (sjHint != null && sjHint.getNumEntries() > 0) {
LOG.debug("Setting size for " + keyBaseAlias + " to " + sjHint.getNumEntries() + " based on the hint");
} catch (SemanticException e) {
// Failure to build the aggregations is logged and rethrown unchecked with the cause attached.
LOG.error("Error creating min/max aggregations on key", e);
throw new IllegalStateException("Error creating min/max aggregations on key", e);
// Create the Group by Operator
// NOTE(review): gbOutputNames.get(0..2) is read below but no additions are visible in this excerpt.
ArrayList<String> gbOutputNames = new ArrayList<String>();
GroupByDesc groupBy = new GroupByDesc(GroupByDesc.Mode.HASH, gbOutputNames, new ArrayList<ExprNodeDesc>(), aggs, false, groupByMemoryUsage, memoryThreshold, minReductionHashAggr, minReductionHashAggrLowerBound, null, false, -1, false);
ArrayList<ColumnInfo> groupbyColInfos = new ArrayList<ColumnInfo>();
groupbyColInfos.add(new ColumnInfo(gbOutputNames.get(0), key.getTypeInfo(), "", false));
groupbyColInfos.add(new ColumnInfo(gbOutputNames.get(1), key.getTypeInfo(), "", false));
groupbyColInfos.add(new ColumnInfo(gbOutputNames.get(2), key.getTypeInfo(), "", false));
GroupByOperator groupByOp = (GroupByOperator) OperatorFactory.getAndMakeChild(groupBy, new RowSchema(groupbyColInfos), selectOp);
groupByOp.setColumnExprMap(new HashMap<String, ExprNodeDesc>());
// Get the column names of the aggregations for reduce sink
int colPos = 0;
ArrayList<ExprNodeDesc> rsValueCols = new ArrayList<ExprNodeDesc>();
Map<String, ExprNodeDesc> columnExprMap = new HashMap<String, ExprNodeDesc>();
// The loop covers every aggregation except the last one, using the key's type for each column.
// NOTE(review): the rest of the loop body (adding to rsValueCols, advancing colPos) and its
// closing brace are not visible; the second 'colExpr' below presumably sits after the loop.
for (int i = 0; i < aggs.size() - 1; i++) {
ExprNodeColumnDesc colExpr = new ExprNodeColumnDesc(key.getTypeInfo(), gbOutputNames.get(colPos), "", false);
columnExprMap.put(gbOutputNames.get(colPos), colExpr);
// Bloom Filter uses binary
ExprNodeColumnDesc colExpr = new ExprNodeColumnDesc(TypeInfoFactory.binaryTypeInfo, gbOutputNames.get(colPos), "", false);
columnExprMap.put(gbOutputNames.get(colPos), colExpr);
// Create the reduce sink operator
ReduceSinkDesc rsDesc = PlanUtils.getReduceSinkDesc(new ArrayList<ExprNodeDesc>(), rsValueCols, gbOutputNames, false, -1, 0, 1, Operation.NOT_ACID, NullOrdering.defaultNullOrder(parseContext.getConf()));
ReduceSinkOperator rsOp = (ReduceSinkOperator) OperatorFactory.getAndMakeChild(rsDesc, new RowSchema(groupByOp.getSchema()), groupByOp);
// Create the final Group By Operator
ArrayList<AggregationDesc> aggsFinal = new ArrayList<AggregationDesc>();
try {
List<ObjectInspector> minFinalFnOIs = new ArrayList<ObjectInspector>();
List<ObjectInspector> maxFinalFnOIs = new ArrayList<ObjectInspector>();
List<ObjectInspector> bloomFilterFinalFnOIs = new ArrayList<ObjectInspector>();
ArrayList<ExprNodeDesc> minFinalParams = new ArrayList<ExprNodeDesc>();
ArrayList<ExprNodeDesc> maxFinalParams = new ArrayList<ExprNodeDesc>();
ArrayList<ExprNodeDesc> bloomFilterFinalParams = new ArrayList<ExprNodeDesc>();
// Use the expressions from Reduce Sink.
// Coming from a ReduceSink the aggregations would be in the form VALUE._col0, VALUE._col1
minFinalParams.add(new ExprNodeColumnDesc(rsValueCols.get(0).getTypeInfo(), Utilities.ReduceField.VALUE + "." + gbOutputNames.get(0), "", false));
maxFinalParams.add(new ExprNodeColumnDesc(rsValueCols.get(1).getTypeInfo(), Utilities.ReduceField.VALUE + "." + gbOutputNames.get(1), "", false));
bloomFilterFinalParams.add(new ExprNodeColumnDesc(rsValueCols.get(2).getTypeInfo(), Utilities.ReduceField.VALUE + "." + gbOutputNames.get(2), "", false));
AggregationDesc min = new AggregationDesc("min", FunctionRegistry.getGenericUDAFEvaluator("min", minFinalFnOIs, false, false), minFinalParams, false, Mode.FINAL);
AggregationDesc max = new AggregationDesc("max", FunctionRegistry.getGenericUDAFEvaluator("max", maxFinalFnOIs, false, false), maxFinalParams, false, Mode.FINAL);
AggregationDesc bloomFilter = new AggregationDesc("bloom_filter", FunctionRegistry.getGenericUDAFEvaluator("bloom_filter", bloomFilterFinalFnOIs, false, false), bloomFilterFinalParams, false, Mode.FINAL);
GenericUDAFBloomFilterEvaluator bloomFilterEval = (GenericUDAFBloomFilterEvaluator) bloomFilter.getGenericUDAFEvaluator();
// NOTE(review): the body of this hint branch and the additions to aggsFinal are not visible here.
if (sjHint != null && sjHint.getNumEntries() > 0) {
} catch (SemanticException e) {
LOG.error("Error creating min/max aggregations on key", e);
throw new IllegalStateException("Error creating min/max aggregations on key", e);
GroupByDesc groupByDescFinal = new GroupByDesc(GroupByDesc.Mode.FINAL, gbOutputNames, new ArrayList<ExprNodeDesc>(), aggsFinal, false, groupByMemoryUsage, memoryThreshold, minReductionHashAggr, minReductionHashAggrLowerBound, null, false, 0, false);
GroupByOperator groupByOpFinal = (GroupByOperator) OperatorFactory.getAndMakeChild(groupByDescFinal, new RowSchema(rsOp.getSchema()), rsOp);
groupByOpFinal.setColumnExprMap(new HashMap<String, ExprNodeDesc>());
// Hand the final group-by to the shared helper that is also used when an existing branch is reused.
createFinalRsForSemiJoinOp(parseContext, ts, groupByOpFinal, key, keyBaseAlias, ctx.parent.getChildren().get(0), sjHint != null);
return true;