Use of org.apache.hadoop.hive.ql.parse.PrunedPartitionList in project hive by apache.
The class GenMRTableScan1, method process:
/**
* Table scan encountered.
* @param nd
* the table scan operator encountered
* @param opProcCtx
* context
*/
@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx opProcCtx, Object... nodeOutputs) throws SemanticException {
TableScanOperator op = (TableScanOperator) nd;
GenMRProcContext ctx = (GenMRProcContext) opProcCtx;
ParseContext parseCtx = ctx.getParseCtx();
Class<? extends InputFormat> inputFormat = op.getConf().getTableMetadata().getInputFormatClass();
Map<Operator<? extends OperatorDesc>, GenMapRedCtx> mapCurrCtx = ctx.getMapCurrCtx();
// create a dummy MapReduce task
MapredWork currWork = GenMapRedUtils.getMapRedWork(parseCtx);
MapRedTask currTask = (MapRedTask) TaskFactory.get(currWork, parseCtx.getConf());
ctx.setCurrTask(currTask);
ctx.setCurrTopOp(op);
for (String alias : parseCtx.getTopOps().keySet()) {
Operator<? extends OperatorDesc> currOp = parseCtx.getTopOps().get(alias);
if (currOp == op) {
String currAliasId = alias;
ctx.setCurrAliasId(currAliasId);
mapCurrCtx.put(op, new GenMapRedCtx(currTask, currAliasId));
if (parseCtx.getQueryProperties().isAnalyzeCommand()) {
boolean partialScan = parseCtx.getQueryProperties().isPartialScanAnalyzeCommand();
boolean noScan = parseCtx.getQueryProperties().isNoScanAnalyzeCommand();
if (OrcInputFormat.class.isAssignableFrom(inputFormat) || MapredParquetInputFormat.class.isAssignableFrom(inputFormat)) {
// For ORC and Parquet, all the following statements are the same
// ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS
// ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS partialscan;
// ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS noscan;
// There will not be any MR or Tez job above this task
StatsNoJobWork snjWork = new StatsNoJobWork(op.getConf().getTableMetadata().getTableSpec());
snjWork.setStatsReliable(parseCtx.getConf().getBoolVar(HiveConf.ConfVars.HIVE_STATS_RELIABLE));
// If partition is specified, get pruned partition list
Set<Partition> confirmedParts = GenMapRedUtils.getConfirmedPartitionsForScan(op);
if (confirmedParts.size() > 0) {
Table source = op.getConf().getTableMetadata();
List<String> partCols = GenMapRedUtils.getPartitionColumns(op);
PrunedPartitionList partList = new PrunedPartitionList(source, confirmedParts, partCols, false);
snjWork.setPrunedPartitionList(partList);
}
Task<StatsNoJobWork> snjTask = TaskFactory.get(snjWork, parseCtx.getConf());
ctx.setCurrTask(snjTask);
ctx.setCurrTopOp(null);
ctx.getRootTasks().clear();
ctx.getRootTasks().add(snjTask);
} else {
// ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS;
// The plan consists of a simple MapRedTask followed by a StatsTask.
// The MR task is just a simple TableScanOperator
StatsWork statsWork = new StatsWork(op.getConf().getTableMetadata().getTableSpec());
statsWork.setAggKey(op.getConf().getStatsAggPrefix());
statsWork.setStatsTmpDir(op.getConf().getTmpStatsDir());
statsWork.setSourceTask(currTask);
statsWork.setStatsReliable(parseCtx.getConf().getBoolVar(HiveConf.ConfVars.HIVE_STATS_RELIABLE));
Task<StatsWork> statsTask = TaskFactory.get(statsWork, parseCtx.getConf());
currTask.addDependentTask(statsTask);
if (!ctx.getRootTasks().contains(currTask)) {
ctx.getRootTasks().add(currTask);
}
// ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS noscan;
// The plan consists of a StatsTask only.
if (noScan) {
statsTask.setParentTasks(null);
statsWork.setNoScanAnalyzeCommand(true);
ctx.getRootTasks().remove(currTask);
ctx.getRootTasks().add(statsTask);
}
// ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS partialscan;
if (partialScan) {
handlePartialScanCommand(op, ctx, parseCtx, currTask, statsWork, statsTask);
}
currWork.getMapWork().setGatheringStats(true);
if (currWork.getReduceWork() != null) {
currWork.getReduceWork().setGatheringStats(true);
}
// NOTE: here we should use the new partition predicate pushdown API to get the pruned
// partition list, and pass it to setTaskPlan as the last parameter
Set<Partition> confirmedPartns = GenMapRedUtils.getConfirmedPartitionsForScan(op);
if (confirmedPartns.size() > 0) {
Table source = op.getConf().getTableMetadata();
List<String> partCols = GenMapRedUtils.getPartitionColumns(op);
PrunedPartitionList partList = new PrunedPartitionList(source, confirmedPartns, partCols, false);
GenMapRedUtils.setTaskPlan(currAliasId, op, currTask, false, ctx, partList);
} else {
// non-partitioned table
GenMapRedUtils.setTaskPlan(currAliasId, op, currTask, false, ctx);
}
}
}
return true;
}
}
assert false;
return null;
}
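
The same wrap-the-confirmed-partitions pattern appears twice in the method above: once for the StatsNoJobWork branch and once just before setTaskPlan. Below is a minimal sketch of that pattern pulled into a helper; the class and method names are hypothetical, the imports are best-effort guesses for this Hive code base, and only calls already visible above are assumed to exist.

import java.util.List;
import java.util.Set;

import org.apache.hadoop.hive.ql.exec.TableScanOperator;
import org.apache.hadoop.hive.ql.metadata.Partition;
import org.apache.hadoop.hive.ql.metadata.Table;
import org.apache.hadoop.hive.ql.optimizer.GenMapRedUtils;
import org.apache.hadoop.hive.ql.parse.PrunedPartitionList;
import org.apache.hadoop.hive.ql.parse.SemanticException;

final class ConfirmedPartitionListSketch {

  // Wraps the confirmed partitions of an analyzed table scan into a
  // PrunedPartitionList, or returns null for a non-partitioned table.
  static PrunedPartitionList fromConfirmedPartitions(TableScanOperator op)
      throws SemanticException {
    Set<Partition> confirmed = GenMapRedUtils.getConfirmedPartitionsForScan(op);
    if (confirmed.isEmpty()) {
      return null;
    }
    Table source = op.getConf().getTableMetadata();
    List<String> partCols = GenMapRedUtils.getPartitionColumns(op);
    // The final "false" marks the list as exact: no unknown partitions remain.
    return new PrunedPartitionList(source, confirmed, partCols, false);
  }
}

In the method above, the resulting list is handed either to StatsNoJobWork.setPrunedPartitionList or to GenMapRedUtils.setTaskPlan as its last argument.
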
Use of org.apache.hadoop.hive.ql.parse.PrunedPartitionList in project hive by apache.
The class SimpleFetchOptimizer, method checkTree:
// all we can handle is LimitOperator, FilterOperator, SelectOperator and final FS
//
// for non-aggressive mode (minimal)
// 1. sampling is not allowed
// 2. for partitioned table, all filters should be targeted to partition column
// 3. SelectOperator should use only simple cast/column access
private FetchData checkTree(boolean aggressive, ParseContext pctx, String alias, TableScanOperator ts) throws HiveException {
SplitSample splitSample = pctx.getNameToSplitSample().get(alias);
if (!aggressive && splitSample != null) {
return null;
}
if (!aggressive && ts.getConf().getTableSample() != null) {
return null;
}
Table table = ts.getConf().getTableMetadata();
if (table == null) {
return null;
}
ReadEntity parent = PlanUtils.getParentViewInfo(alias, pctx.getViewAliasToInput());
if (!table.isPartitioned()) {
FetchData fetch = new FetchData(ts, parent, table, splitSample);
return checkOperators(fetch, aggressive, false);
}
boolean bypassFilter = false;
if (HiveConf.getBoolVar(pctx.getConf(), HiveConf.ConfVars.HIVEOPTPPD)) {
ExprNodeDesc pruner = pctx.getOpToPartPruner().get(ts);
if (PartitionPruner.onlyContainsPartnCols(table, pruner)) {
bypassFilter = !pctx.getPrunedPartitions(alias, ts).hasUnknownPartitions();
}
}
if (!aggressive && !bypassFilter) {
return null;
}
PrunedPartitionList partitions = pctx.getPrunedPartitions(alias, ts);
FetchData fetch = new FetchData(ts, parent, table, partitions, splitSample, bypassFilter);
return checkOperators(fetch, aggressive, bypassFilter);
}
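
For the partitioned-table branch above, the bypass-filter decision can be read as a single predicate. The sketch below only restates that logic using the same ParseContext and PartitionPruner calls that checkTree already makes; the class and method names are hypothetical.

import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.exec.TableScanOperator;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.optimizer.ppr.PartitionPruner;
import org.apache.hadoop.hive.ql.parse.ParseContext;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;

final class FetchFilterBypassSketch {

  // True only when the partition pruner could evaluate the whole filter
  // exactly, so no row-level FilterOperator is needed for the fetch.
  static boolean canBypassFilter(ParseContext pctx, String alias, TableScanOperator ts)
      throws HiveException {
    if (!HiveConf.getBoolVar(pctx.getConf(), HiveConf.ConfVars.HIVEOPTPPD)) {
      return false; // predicate pushdown disabled
    }
    ExprNodeDesc pruner = pctx.getOpToPartPruner().get(ts);
    if (!PartitionPruner.onlyContainsPartnCols(ts.getConf().getTableMetadata(), pruner)) {
      return false; // filter touches non-partition columns
    }
    // Exact pruning means no "unknown" partitions survived.
    return !pctx.getPrunedPartitions(alias, ts).hasUnknownPartitions();
  }
}

In non-aggressive (minimal) mode, checkTree bails out unless this predicate holds; in aggressive mode the fetch is attempted even when the filter cannot be bypassed.
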
Use of org.apache.hadoop.hive.ql.parse.PrunedPartitionList in project hive by apache.
The class PartitionPruner, method getPartitionsFromServer:
private static PrunedPartitionList getPartitionsFromServer(Table tab, final ExprNodeGenericFuncDesc compactExpr, HiveConf conf, String alias, Set<String> partColsUsedInFilter, boolean isPruningByExactFilter) throws SemanticException {
try {
// Finally, check the filter for non-built-in UDFs. If these are present, we cannot
// do filtering on the server, and have to fall back to client path.
boolean doEvalClientSide = hasUserFunctions(compactExpr);
// Now filter.
List<Partition> partitions = new ArrayList<Partition>();
boolean hasUnknownPartitions = false;
PerfLogger perfLogger = SessionState.getPerfLogger();
if (!doEvalClientSide) {
perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.PARTITION_RETRIEVING);
try {
hasUnknownPartitions = Hive.get().getPartitionsByExpr(tab, compactExpr, conf, partitions);
} catch (IMetaStoreClient.IncompatibleMetastoreException ime) {
// TODO: backward compat for Hive <= 0.12. Can be removed later.
LOG.warn("Metastore doesn't support getPartitionsByExpr", ime);
doEvalClientSide = true;
} finally {
perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.PARTITION_RETRIEVING);
}
}
if (doEvalClientSide) {
// Either we have user functions, or metastore is old version - filter names locally.
hasUnknownPartitions = pruneBySequentialScan(tab, partitions, compactExpr, conf);
}
// The filter may have been pushed to the metastore only partially, so some of the
// returned partitions may have no data based on the other filters.
return new PrunedPartitionList(tab, new LinkedHashSet<Partition>(partitions), new ArrayList<String>(partColsUsedInFilter), hasUnknownPartitions || !isPruningByExactFilter);
} catch (SemanticException e) {
throw e;
} catch (Exception e) {
throw new SemanticException(e);
}
}
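
A short, hedged illustration of how callers typically inspect the returned list, using only PrunedPartitionList accessors that appear elsewhere in this section (getPartitions, hasUnknownPartitions); the class and method names are made up for the example.

import org.apache.hadoop.hive.ql.metadata.Partition;
import org.apache.hadoop.hive.ql.parse.PrunedPartitionList;

final class PrunedPartitionListInspection {

  // Prints whether pruning was exact and lists the surviving partitions.
  static void describe(PrunedPartitionList partList) {
    // hasUnknownPartitions() is true when the filter could not be applied
    // exactly (user UDFs, an old metastore, or a non-exact pruning filter),
    // so a FilterOperator still has to run over the returned partitions.
    if (partList.hasUnknownPartitions()) {
      System.out.println("Pruning was inexact; keep the row-level filter.");
    }
    for (Partition p : partList.getPartitions()) {
      System.out.println("Candidate partition: " + p.getName());
    }
  }
}
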
Use of org.apache.hadoop.hive.ql.parse.PrunedPartitionList in project hive by apache.
The class HiveMaterializedViewsRegistry, method createTableScan:
private static RelNode createTableScan(Table viewTable) {
// 0. Recreate cluster
final RelOptPlanner planner = HiveVolcanoPlanner.createPlanner(null);
final RexBuilder rexBuilder = new RexBuilder(new JavaTypeFactoryImpl());
final RelOptCluster cluster = RelOptCluster.create(planner, rexBuilder);
// 1. Create column schema
final RowResolver rr = new RowResolver();
// 1.1 Add column info for non-partition cols (Object Inspector fields)
StructObjectInspector rowObjectInspector;
try {
rowObjectInspector = (StructObjectInspector) viewTable.getDeserializer().getObjectInspector();
} catch (SerDeException e) {
// Bail out
return null;
}
List<? extends StructField> fields = rowObjectInspector.getAllStructFieldRefs();
ColumnInfo colInfo;
String colName;
ArrayList<ColumnInfo> cInfoLst = new ArrayList<ColumnInfo>();
for (int i = 0; i < fields.size(); i++) {
colName = fields.get(i).getFieldName();
colInfo = new ColumnInfo(fields.get(i).getFieldName(), TypeInfoUtils.getTypeInfoFromObjectInspector(fields.get(i).getFieldObjectInspector()), null, false);
rr.put(null, colName, colInfo);
cInfoLst.add(colInfo);
}
ArrayList<ColumnInfo> nonPartitionColumns = new ArrayList<ColumnInfo>(cInfoLst);
// 1.2 Add column info corresponding to partition columns
ArrayList<ColumnInfo> partitionColumns = new ArrayList<ColumnInfo>();
for (FieldSchema part_col : viewTable.getPartCols()) {
colName = part_col.getName();
colInfo = new ColumnInfo(colName, TypeInfoFactory.getPrimitiveTypeInfo(part_col.getType()), null, true);
rr.put(null, colName, colInfo);
cInfoLst.add(colInfo);
partitionColumns.add(colInfo);
}
// 1.3 Build row type from field <type, name>
RelDataType rowType;
try {
rowType = TypeConverter.getType(cluster, rr, null);
} catch (CalciteSemanticException e) {
// Bail out
return null;
}
// 2. Build RelOptAbstractTable
String fullyQualifiedTabName = viewTable.getDbName();
if (fullyQualifiedTabName != null && !fullyQualifiedTabName.isEmpty()) {
fullyQualifiedTabName = fullyQualifiedTabName + "." + viewTable.getTableName();
} else {
fullyQualifiedTabName = viewTable.getTableName();
}
RelOptHiveTable optTable = new RelOptHiveTable(null, fullyQualifiedTabName, rowType, viewTable, nonPartitionColumns, partitionColumns, new ArrayList<VirtualColumn>(), SessionState.get().getConf(), new HashMap<String, PrunedPartitionList>(), new AtomicInteger());
RelNode tableRel;
// 3. Build operator
if (obtainTableType(viewTable) == TableType.DRUID) {
// Build Druid query
String address = HiveConf.getVar(SessionState.get().getConf(), HiveConf.ConfVars.HIVE_DRUID_BROKER_DEFAULT_ADDRESS);
String dataSource = viewTable.getParameters().get(Constants.DRUID_DATA_SOURCE);
Set<String> metrics = new HashSet<>();
List<RelDataType> druidColTypes = new ArrayList<>();
List<String> druidColNames = new ArrayList<>();
for (RelDataTypeField field : rowType.getFieldList()) {
druidColTypes.add(field.getType());
druidColNames.add(field.getName());
if (field.getName().equals(DruidTable.DEFAULT_TIMESTAMP_COLUMN)) {
// timestamp
continue;
}
if (field.getType().getSqlTypeName() == SqlTypeName.VARCHAR) {
// dimension
continue;
}
metrics.add(field.getName());
}
List<Interval> intervals = Arrays.asList(DruidTable.DEFAULT_INTERVAL);
DruidTable druidTable = new DruidTable(new DruidSchema(address, address, false), dataSource, RelDataTypeImpl.proto(rowType), metrics, DruidTable.DEFAULT_TIMESTAMP_COLUMN, intervals);
final TableScan scan = new HiveTableScan(cluster, cluster.traitSetOf(HiveRelNode.CONVENTION), optTable, viewTable.getTableName(), null, false, false);
tableRel = DruidQuery.create(cluster, cluster.traitSetOf(HiveRelNode.CONVENTION), optTable, druidTable, ImmutableList.<RelNode>of(scan));
} else {
// Build Hive Table Scan Rel
tableRel = new HiveTableScan(cluster, cluster.traitSetOf(HiveRelNode.CONVENTION), optTable, viewTable.getTableName(), null, false, false);
}
return tableRel;
}
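
One non-obvious piece of the Druid branch above is how columns are classified: the default timestamp column is the time dimension, VARCHAR columns are dimensions, and everything else becomes a metric. The sketch below restates just that loop; the class and method names are hypothetical.

import java.util.HashSet;
import java.util.Set;

import org.apache.calcite.adapter.druid.DruidTable;
import org.apache.calcite.rel.type.RelDataType;
import org.apache.calcite.rel.type.RelDataTypeField;
import org.apache.calcite.sql.type.SqlTypeName;

final class DruidColumnClassificationSketch {

  // Returns the metric column names of a row type, skipping the Druid
  // timestamp column and VARCHAR dimensions, as in createTableScan above.
  static Set<String> extractMetrics(RelDataType rowType) {
    Set<String> metrics = new HashSet<>();
    for (RelDataTypeField field : rowType.getFieldList()) {
      if (field.getName().equals(DruidTable.DEFAULT_TIMESTAMP_COLUMN)) {
        continue; // the time dimension
      }
      if (field.getType().getSqlTypeName() == SqlTypeName.VARCHAR) {
        continue; // a regular dimension
      }
      metrics.add(field.getName()); // everything else is a metric
    }
    return metrics;
  }
}

Note that createTableScan hands RelOptHiveTable an empty Map<String, PrunedPartitionList>: no pruning has happened yet when the materialized view's scan is registered.
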
Use of org.apache.hadoop.hive.ql.parse.PrunedPartitionList in project hive by apache.
The class AbstractBucketJoinProc, method checkConvertBucketMapJoin:
/*
* Can this mapjoin be converted to a bucketed mapjoin?
* The following checks are performed:
* a. The join columns contain all the bucket columns.
* b. The join keys are not transformed in the sub-query.
* c. All partitions contain the expected number of files (number of buckets).
* d. The number of buckets in the big table can be divided by the number of buckets in the small tables.
*/
protected boolean checkConvertBucketMapJoin(BucketJoinProcCtx context, Map<String, Operator<? extends OperatorDesc>> aliasToOpInfo, Map<Byte, List<ExprNodeDesc>> keysMap, String baseBigAlias, List<String> joinAliases) throws SemanticException {
LinkedHashMap<String, List<Integer>> tblAliasToNumberOfBucketsInEachPartition = new LinkedHashMap<String, List<Integer>>();
LinkedHashMap<String, List<List<String>>> tblAliasToBucketedFilePathsInEachPartition = new LinkedHashMap<String, List<List<String>>>();
HashMap<String, TableScanOperator> topOps = pGraphContext.getTopOps();
HashMap<String, String> aliasToNewAliasMap = new HashMap<String, String>();
// (partition to bucket file names) and (partition to bucket number) for
// the big table;
LinkedHashMap<Partition, List<String>> bigTblPartsToBucketFileNames = new LinkedHashMap<Partition, List<String>>();
LinkedHashMap<Partition, Integer> bigTblPartsToBucketNumber = new LinkedHashMap<Partition, Integer>();
// the order in which join cols map to bucket cols should be the same for all tables
Integer[] joinKeyOrder = null;
boolean bigTablePartitioned = true;
for (int index = 0; index < joinAliases.size(); index++) {
String alias = joinAliases.get(index);
Operator<? extends OperatorDesc> topOp = aliasToOpInfo.get(alias);
// The alias may not be present in case of a sub-query
if (topOp == null) {
return false;
}
List<String> keys = toColumns(keysMap.get((byte) index));
if (keys == null || keys.isEmpty()) {
return false;
}
int oldKeySize = keys.size();
TableScanOperator tso = TableAccessAnalyzer.genRootTableScan(topOp, keys);
if (tso == null) {
// The join keys could not be traced back to a root TableScan (e.g. they are transformed
// between topOp and the root TableScan operator). We don't handle that case, and simply return
return false;
}
// For nested sub-queries, the alias mapping is not maintained in QB currently.
if (topOps.containsValue(tso)) {
for (Map.Entry<String, TableScanOperator> topOpEntry : topOps.entrySet()) {
if (topOpEntry.getValue() == tso) {
String newAlias = topOpEntry.getKey();
if (!newAlias.equals(alias)) {
joinAliases.set(index, newAlias);
if (baseBigAlias.equals(alias)) {
baseBigAlias = newAlias;
}
aliasToNewAliasMap.put(alias, newAlias);
alias = newAlias;
}
break;
}
}
} else {
// Ideally, this should never happen, and this should be an assert.
return false;
}
// Join keys that do not resolve to columns of the root table will have been removed,
// so the key size before and after genRootTableScan will be different.
if (keys.size() != oldKeySize) {
return false;
}
if (joinKeyOrder == null) {
joinKeyOrder = new Integer[keys.size()];
}
Table tbl = tso.getConf().getTableMetadata();
if (tbl.isPartitioned()) {
PrunedPartitionList prunedParts = pGraphContext.getPrunedPartitions(alias, tso);
List<Partition> partitions = prunedParts.getNotDeniedPartns();
// construct a mapping of (Partition->bucket file names) and (Partition -> bucket number)
if (partitions.isEmpty()) {
if (!alias.equals(baseBigAlias)) {
tblAliasToNumberOfBucketsInEachPartition.put(alias, Arrays.<Integer>asList());
tblAliasToBucketedFilePathsInEachPartition.put(alias, new ArrayList<List<String>>());
}
} else {
List<Integer> buckets = new ArrayList<Integer>();
List<List<String>> files = new ArrayList<List<String>>();
for (Partition p : partitions) {
if (!checkBucketColumns(p.getBucketCols(), keys, joinKeyOrder)) {
return false;
}
List<String> fileNames = getBucketFilePathsOfPartition(p.getDataLocation(), pGraphContext);
// The number of files for the table should be same as number of buckets.
int bucketCount = p.getBucketCount();
if (fileNames.size() != 0 && fileNames.size() != bucketCount) {
String msg = "The number of buckets for table " + tbl.getTableName() + " partition " + p.getName() + " is " + p.getBucketCount() + ", whereas the number of files is " + fileNames.size();
throw new SemanticException(ErrorMsg.BUCKETED_TABLE_METADATA_INCORRECT.getMsg(msg));
}
if (alias.equals(baseBigAlias)) {
bigTblPartsToBucketFileNames.put(p, fileNames);
bigTblPartsToBucketNumber.put(p, bucketCount);
} else {
files.add(fileNames);
buckets.add(bucketCount);
}
}
if (!alias.equals(baseBigAlias)) {
tblAliasToNumberOfBucketsInEachPartition.put(alias, buckets);
tblAliasToBucketedFilePathsInEachPartition.put(alias, files);
}
}
} else {
if (!checkBucketColumns(tbl.getBucketCols(), keys, joinKeyOrder)) {
return false;
}
List<String> fileNames = getBucketFilePathsOfPartition(tbl.getDataLocation(), pGraphContext);
Integer num = new Integer(tbl.getNumBuckets());
// The number of files for the table should be same as number of buckets.
if (fileNames.size() != 0 && fileNames.size() != num) {
String msg = "The number of buckets for table " + tbl.getTableName() + " is " + tbl.getNumBuckets() + ", whereas the number of files is " + fileNames.size();
throw new SemanticException(ErrorMsg.BUCKETED_TABLE_METADATA_INCORRECT.getMsg(msg));
}
if (alias.equals(baseBigAlias)) {
bigTblPartsToBucketFileNames.put(null, fileNames);
bigTblPartsToBucketNumber.put(null, tbl.getNumBuckets());
bigTablePartitioned = false;
} else {
tblAliasToNumberOfBucketsInEachPartition.put(alias, Arrays.asList(num));
tblAliasToBucketedFilePathsInEachPartition.put(alias, Arrays.asList(fileNames));
}
}
}
// Check that the number of buckets in each partition of the big table can be divided
// by the number of buckets in the small tables.
for (Integer numBucketsInPartitionOfBigTable : bigTblPartsToBucketNumber.values()) {
if (!checkNumberOfBucketsAgainstBigTable(tblAliasToNumberOfBucketsInEachPartition, numBucketsInPartitionOfBigTable)) {
return false;
}
}
context.setTblAliasToNumberOfBucketsInEachPartition(tblAliasToNumberOfBucketsInEachPartition);
context.setTblAliasToBucketedFilePathsInEachPartition(tblAliasToBucketedFilePathsInEachPartition);
context.setBigTblPartsToBucketFileNames(bigTblPartsToBucketFileNames);
context.setBigTblPartsToBucketNumber(bigTblPartsToBucketNumber);
context.setJoinAliases(joinAliases);
context.setBaseBigAlias(baseBigAlias);
context.setBigTablePartitioned(bigTablePartitioned);
if (!aliasToNewAliasMap.isEmpty()) {
context.setAliasToNewAliasMap(aliasToNewAliasMap);
}
return true;
}
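
The same bucket-metadata validation appears twice above: once per partition of a partitioned table and once for an unpartitioned table. The number of bucket files on disk must either be zero or match the declared bucket count. A minimal sketch of that check, with a hypothetical helper name and an illustrative message format:

import java.util.List;

import org.apache.hadoop.hive.ql.ErrorMsg;
import org.apache.hadoop.hive.ql.parse.SemanticException;

final class BucketFileCountCheck {

  // Throws if the partition/table has files but their count does not match
  // the declared number of buckets; an empty location is tolerated.
  static void validate(String tableName, String partName, int declaredBuckets,
      List<String> bucketFiles) throws SemanticException {
    if (!bucketFiles.isEmpty() && bucketFiles.size() != declaredBuckets) {
      String msg = "The number of buckets for table " + tableName
          + (partName == null ? "" : " partition " + partName)
          + " is " + declaredBuckets + ", whereas the number of files is "
          + bucketFiles.size();
      throw new SemanticException(ErrorMsg.BUCKETED_TABLE_METADATA_INCORRECT.getMsg(msg));
    }
  }
}
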