Use of org.apache.hadoop.hive.ql.exec.TableScanOperator in project hive by apache.
The class GenMapRedUtils, method createTemporaryTableScanOperator.
public static TableScanOperator createTemporaryTableScanOperator(CompilationOpContext ctx, RowSchema rowSchema) {
  TableScanOperator tableScanOp =
      (TableScanOperator) OperatorFactory.get(ctx, new TableScanDesc(null), rowSchema);
  // Set needed columns for this dummy TableScanOperator
  List<Integer> neededColumnIds = new ArrayList<Integer>();
  List<String> neededColumnNames = new ArrayList<String>();
  List<ColumnInfo> parentColumnInfos = rowSchema.getSignature();
  for (int i = 0; i < parentColumnInfos.size(); i++) {
    neededColumnIds.add(i);
    neededColumnNames.add(parentColumnInfos.get(i).getInternalName());
  }
  tableScanOp.setNeededColumnIDs(neededColumnIds);
  tableScanOp.setNeededColumns(neededColumnNames);
  tableScanOp.setReferencedColumns(neededColumnNames);
  return tableScanOp;
}
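For orientation, a minimal usage sketch of this helper follows. The one-column schema, the helper name buildDummyScan, and the use of CompilationOpContext's no-argument constructor are illustrative assumptions, not taken from the Hive source above.
// Hypothetical usage sketch (helper name and schema are illustrative):
static TableScanOperator buildDummyScan() {
  CompilationOpContext ctx = new CompilationOpContext();
  ColumnInfo keyCol = new ColumnInfo("key", TypeInfoFactory.stringTypeInfo, null, false);
  RowSchema schema = new RowSchema(new ArrayList<ColumnInfo>(Arrays.asList(keyCol)));
  TableScanOperator dummyScan = GenMapRedUtils.createTemporaryTableScanOperator(ctx, schema);
  // The dummy operator now declares every column of the schema as needed ("key" here).
  return dummyScan;
}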
Use of org.apache.hadoop.hive.ql.exec.TableScanOperator in project hive by apache.
The class GenMRFileSink1, method processLinkedFileDesc.
/*
 * Multiple file sink descriptors are linked.
 * Use the task created by the first linked file descriptor.
 */
private void processLinkedFileDesc(GenMRProcContext ctx, Task<? extends Serializable> childTask)
    throws SemanticException {
  Task<? extends Serializable> currTask = ctx.getCurrTask();
  TableScanOperator currTopOp = ctx.getCurrTopOp();
  if (currTopOp != null && !ctx.isSeenOp(currTask, currTopOp)) {
    String currAliasId = ctx.getCurrAliasId();
    GenMapRedUtils.setTaskPlan(currAliasId, currTopOp, currTask, false, ctx);
  }
  if (childTask != null) {
    currTask.addDependentTask(childTask);
  }
}
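Beyond registering the table scan in the task plan, the only wiring this method performs is the parent/child task link. The self-contained sketch below illustrates that Task API in isolation; the MoveTask instances and the linkTasks helper are placeholders chosen for this sketch, not part of the code above.
// Illustrative only: how addDependentTask chains two tasks (MoveTask used as a placeholder).
static void linkTasks() {
  Task<MoveWork> parentTask = new MoveTask();
  Task<MoveWork> childTask = new MoveTask();
  parentTask.addDependentTask(childTask);
  // childTask.getParentTasks() now contains parentTask, so the driver will not
  // launch childTask until parentTask has completed successfully.
}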
Use of org.apache.hadoop.hive.ql.exec.TableScanOperator in project hive by apache.
The class SimpleFetchOptimizer, method transform.
@Override
public ParseContext transform(ParseContext pctx) throws SemanticException {
  Map<String, TableScanOperator> topOps = pctx.getTopOps();
  if (pctx.getQueryProperties().isQuery() && !pctx.getQueryProperties().isAnalyzeCommand()
      && topOps.size() == 1) {
    // no join, no groupby, no distinct, no lateral view, no subq,
    // no CTAS or insert, not analyze command, and single sourced.
    String alias = (String) pctx.getTopOps().keySet().toArray()[0];
    TableScanOperator topOp = pctx.getTopOps().values().iterator().next();
    try {
      FetchTask fetchTask = optimize(pctx, alias, topOp);
      if (fetchTask != null) {
        pctx.setFetchTask(fetchTask);
      }
    } catch (Exception e) {
      // Has to use full name to make sure it does not conflict with
      // org.apache.commons.lang.StringUtils
      LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
      if (e instanceof SemanticException) {
        throw (SemanticException) e;
      }
      throw new SemanticException(e.getMessage(), e);
    }
  }
  return pctx;
}
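A hedged sketch of how this transform slots into compilation follows. Building a real ParseContext requires a full semantic analysis pass, so the pctx parameter and the maybeConvertToFetch helper below are assumptions made for illustration only.
// Hypothetical invocation, assuming pctx was produced by semantic analysis:
static ParseContext maybeConvertToFetch(ParseContext pctx) throws SemanticException {
  ParseContext optimized = new SimpleFetchOptimizer().transform(pctx);
  if (optimized.getFetchTask() != null) {
    // The simple single-table query will be answered directly by a FetchTask
    // instead of being compiled into a MapReduce/Tez job.
  }
  return optimized;
}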
Use of org.apache.hadoop.hive.ql.exec.TableScanOperator in project hive by apache.
The class AbstractBucketJoinProc, method checkConvertBucketMapJoin.
/*
 * Can this mapjoin be converted to a bucketed mapjoin?
 * The following checks are performed:
 * a. The join columns contain all the bucket columns.
 * b. The join keys are not transformed in the sub-query.
 * c. All partitions contain the expected number of files (number of buckets).
 * d. The number of buckets in the big table can be divided by the number of buckets
 *    in each small table.
 */
protected boolean checkConvertBucketMapJoin(BucketJoinProcCtx context,
    Map<String, Operator<? extends OperatorDesc>> aliasToOpInfo,
    Map<Byte, List<ExprNodeDesc>> keysMap, String baseBigAlias,
    List<String> joinAliases) throws SemanticException {
  LinkedHashMap<String, List<Integer>> tblAliasToNumberOfBucketsInEachPartition =
      new LinkedHashMap<String, List<Integer>>();
  LinkedHashMap<String, List<List<String>>> tblAliasToBucketedFilePathsInEachPartition =
      new LinkedHashMap<String, List<List<String>>>();
  HashMap<String, TableScanOperator> topOps = pGraphContext.getTopOps();
  HashMap<String, String> aliasToNewAliasMap = new HashMap<String, String>();
  // (partition to bucket file names) and (partition to bucket number) for the big table
  LinkedHashMap<Partition, List<String>> bigTblPartsToBucketFileNames =
      new LinkedHashMap<Partition, List<String>>();
  LinkedHashMap<Partition, Integer> bigTblPartsToBucketNumber =
      new LinkedHashMap<Partition, Integer>();
  // accessing order of join cols to bucket cols, should be same
  Integer[] joinKeyOrder = null;
  boolean bigTablePartitioned = true;
  for (int index = 0; index < joinAliases.size(); index++) {
    String alias = joinAliases.get(index);
    Operator<? extends OperatorDesc> topOp = aliasToOpInfo.get(alias);
    // The alias may not be present in case of a sub-query
    if (topOp == null) {
      return false;
    }
    List<String> keys = toColumns(keysMap.get((byte) index));
    if (keys == null || keys.isEmpty()) {
      return false;
    }
    int oldKeySize = keys.size();
    TableScanOperator tso = TableAccessAnalyzer.genRootTableScan(topOp, keys);
    if (tso == null) {
      // The join keys could not be traced back to a root TableScan, e.g. they are transformed
      // somewhere between topOp and root TableScan operator. We don't handle that case,
      // and simply return false.
      return false;
    }
    // For nested sub-queries, the alias mapping is not maintained in QB currently.
    if (topOps.containsValue(tso)) {
      for (Map.Entry<String, TableScanOperator> topOpEntry : topOps.entrySet()) {
        if (topOpEntry.getValue() == tso) {
          String newAlias = topOpEntry.getKey();
          if (!newAlias.equals(alias)) {
            joinAliases.set(index, newAlias);
            if (baseBigAlias.equals(alias)) {
              baseBigAlias = newAlias;
            }
            aliasToNewAliasMap.put(alias, newAlias);
            alias = newAlias;
          }
          break;
        }
      }
    } else {
      // Ideally, this should never happen, and this should be an assert.
      return false;
    }
    // If any of the join keys could not be mapped to a column of the root table, it will
    // be removed, and the size before and after the genRootTableScan will be different.
    if (keys.size() != oldKeySize) {
      return false;
    }
    if (joinKeyOrder == null) {
      joinKeyOrder = new Integer[keys.size()];
    }
    Table tbl = tso.getConf().getTableMetadata();
    if (AcidUtils.isInsertOnlyTable(tbl.getParameters())) {
      Utilities.FILE_OP_LOGGER.debug("No bucketed join on MM table " + tbl.getTableName());
      return false;
    }
    if (tbl.isPartitioned()) {
      PrunedPartitionList prunedParts = pGraphContext.getPrunedPartitions(alias, tso);
      List<Partition> partitions = prunedParts.getNotDeniedPartns();
      // construct a mapping of (Partition -> bucket file names) and (Partition -> bucket number)
      if (partitions.isEmpty()) {
        if (!alias.equals(baseBigAlias)) {
          tblAliasToNumberOfBucketsInEachPartition.put(alias, Arrays.<Integer>asList());
          tblAliasToBucketedFilePathsInEachPartition.put(alias, new ArrayList<List<String>>());
        }
      } else {
        List<Integer> buckets = new ArrayList<Integer>();
        List<List<String>> files = new ArrayList<List<String>>();
        for (Partition p : partitions) {
          if (!checkBucketColumns(p.getBucketCols(), keys, joinKeyOrder)) {
            return false;
          }
          List<String> fileNames =
              getBucketFilePathsOfPartition(p.getDataLocation(), pGraphContext);
          // The number of files for the partition should be same as number of buckets.
          int bucketCount = p.getBucketCount();
          if (fileNames.size() != 0 && fileNames.size() != bucketCount) {
            String msg = "The number of buckets for table " + tbl.getTableName()
                + " partition " + p.getName() + " is " + p.getBucketCount()
                + ", whereas the number of files is " + fileNames.size();
            throw new SemanticException(ErrorMsg.BUCKETED_TABLE_METADATA_INCORRECT.getMsg(msg));
          }
          if (alias.equals(baseBigAlias)) {
            bigTblPartsToBucketFileNames.put(p, fileNames);
            bigTblPartsToBucketNumber.put(p, bucketCount);
          } else {
            files.add(fileNames);
            buckets.add(bucketCount);
          }
        }
        if (!alias.equals(baseBigAlias)) {
          tblAliasToNumberOfBucketsInEachPartition.put(alias, buckets);
          tblAliasToBucketedFilePathsInEachPartition.put(alias, files);
        }
      }
    } else {
      if (!checkBucketColumns(tbl.getBucketCols(), keys, joinKeyOrder)) {
        return false;
      }
      List<String> fileNames =
          getBucketFilePathsOfPartition(tbl.getDataLocation(), pGraphContext);
      Integer num = new Integer(tbl.getNumBuckets());
      // The number of files for the table should be same as number of buckets.
      if (fileNames.size() != 0 && fileNames.size() != num) {
        String msg = "The number of buckets for table " + tbl.getTableName() + " is "
            + tbl.getNumBuckets() + ", whereas the number of files is " + fileNames.size();
        throw new SemanticException(ErrorMsg.BUCKETED_TABLE_METADATA_INCORRECT.getMsg(msg));
      }
      if (alias.equals(baseBigAlias)) {
        bigTblPartsToBucketFileNames.put(null, fileNames);
        bigTblPartsToBucketNumber.put(null, tbl.getNumBuckets());
        bigTablePartitioned = false;
      } else {
        tblAliasToNumberOfBucketsInEachPartition.put(alias, Arrays.asList(num));
        tblAliasToBucketedFilePathsInEachPartition.put(alias, Arrays.asList(fileNames));
      }
    }
  }
  // Check that the number of buckets in each partition of the big table can be divided by
  // the number of buckets in the small tables.
  for (Integer numBucketsInPartitionOfBigTable : bigTblPartsToBucketNumber.values()) {
    if (!checkNumberOfBucketsAgainstBigTable(tblAliasToNumberOfBucketsInEachPartition,
        numBucketsInPartitionOfBigTable)) {
      return false;
    }
  }
  context.setTblAliasToNumberOfBucketsInEachPartition(tblAliasToNumberOfBucketsInEachPartition);
  context.setTblAliasToBucketedFilePathsInEachPartition(tblAliasToBucketedFilePathsInEachPartition);
  context.setBigTblPartsToBucketFileNames(bigTblPartsToBucketFileNames);
  context.setBigTblPartsToBucketNumber(bigTblPartsToBucketNumber);
  context.setJoinAliases(joinAliases);
  context.setBaseBigAlias(baseBigAlias);
  context.setBigTablePartitioned(bigTablePartitioned);
  if (!aliasToNewAliasMap.isEmpty()) {
    context.setAliasToNewAliasMap(aliasToNewAliasMap);
  }
  return true;
}
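Check (d) above boils down to a per-partition divisibility test between the big table's bucket count and each small table's bucket count. The standalone helper below only illustrates the rule as stated in the comment; it is not the actual checkNumberOfBucketsAgainstBigTable implementation, and its name is made up for this sketch.
// Illustration of check (d) as stated above (not Hive's actual helper):
// the big table's bucket count must be a multiple of each small table's bucket count.
static boolean bigTableBucketsDivisible(int bigTableBuckets, int smallTableBuckets) {
  // e.g. big = 8, small = 4 is compatible; big = 8, small = 3 is not.
  return smallTableBuckets > 0 && bigTableBuckets % smallTableBuckets == 0;
}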
Use of org.apache.hadoop.hive.ql.exec.TableScanOperator in project hive by apache.
The class AbstractSMBJoinProc, method isEligibleForBucketSortMergeJoin.
/**
 * Whether this table is eligible for a sort-merge bucket join.
 *
 * @param smbJoinContext context for the sort-merge bucket join conversion
 * @param keys join key expressions for the table being considered
 * @param aliasToOpInfo mapping from each table alias to its top operator
 * @param aliases table aliases in the join tree, by position
 * @param pos position of the table being checked
 * @param sortColumnsFirstTable The names and order of the sorted columns for the first table.
 *          It is not initialized when pos = 0.
 * @return true if the table at this position can participate in a sort-merge bucket join
 * @throws SemanticException
 */
private boolean isEligibleForBucketSortMergeJoin(SortBucketJoinProcCtx smbJoinContext,
    List<ExprNodeDesc> keys, Map<String, Operator<? extends OperatorDesc>> aliasToOpInfo,
    String[] aliases, int pos, List<Order> sortColumnsFirstTable) throws SemanticException {
  String alias = aliases[pos];
  /*
   * Consider a query like:
   *
   * select -- mapjoin(subq1) -- * from
   * (select a.key, a.value from tbl1 a) subq1
   * join
   * (select a.key, a.value from tbl2 a) subq2
   * on subq1.key = subq2.key;
   *
   * aliasToOpInfo contains the SelectOperator for subq1 and subq2.
   * We need to traverse the tree (using TableAccessAnalyzer) to get to the base
   * table. If the object being map-joined is a base table, then aliasToOpInfo
   * contains the TableScanOperator, and TableAccessAnalyzer is a no-op.
   */
  Operator<? extends OperatorDesc> topOp = aliasToOpInfo.get(alias);
  if (topOp == null) {
    return false;
  }
  // get all join columns from join keys
  List<String> joinCols = toColumns(keys);
  if (joinCols == null || joinCols.isEmpty()) {
    return false;
  }
  TableScanOperator tso = TableAccessAnalyzer.genRootTableScan(topOp, joinCols);
  if (tso == null) {
    return false;
  }
  /*
   * Consider a query like:
   *
   * select count(*) from
   *   (
   *     select key, count(*) from
   *       (
   *         select --mapjoin(a)-- a.key as key, a.value as val1, b.value as val2
   *         from tbl1 a join tbl2 b on a.key = b.key
   *       ) subq1
   *     group by key
   *   ) subq2;
   *
   * The table alias should be subq2:subq1:a which needs to be fetched from topOps.
   */
  if (pGraphContext.getTopOps().containsValue(tso)) {
    for (Map.Entry<String, TableScanOperator> topOpEntry : this.pGraphContext.getTopOps().entrySet()) {
      if (topOpEntry.getValue() == tso) {
        alias = topOpEntry.getKey();
        aliases[pos] = alias;
        break;
      }
    }
  } else {
    // Ideally, this should never happen, and this should be an assert.
    return false;
  }
  Table tbl = tso.getConf().getTableMetadata();
  if (tbl.isPartitioned()) {
    PrunedPartitionList prunedParts = pGraphContext.getPrunedPartitions(alias, tso);
    List<Partition> partitions = prunedParts.getNotDeniedPartns();
    // first table
    if ((pos == 0) && (partitions != null) && (!partitions.isEmpty())) {
      Partition firstPartition = partitions.get(0);
      sortColumnsFirstTable.addAll(firstPartition.getSortCols());
    }
    for (Partition partition : prunedParts.getNotDeniedPartns()) {
      if (!checkSortColsAndJoinCols(partition.getSortCols(), joinCols, sortColumnsFirstTable)) {
        return false;
      }
    }
    return true;
  }
  // Populate the names and order of columns for the first table
  if (pos == 0) {
    sortColumnsFirstTable.addAll(tbl.getSortCols());
  }
  return checkSortColsAndJoinCols(tbl.getSortCols(), joinCols, sortColumnsFirstTable);
}
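The final eligibility question is whether the join columns line up with the table's (or partition's) sort columns. The helper below is a hedged illustration of that kind of check, under the assumption that the join columns must match the leading sort columns in order; it is not the actual checkSortColsAndJoinCols implementation, and its name is made up for this sketch.
// Illustrative prefix check (assumption, not Hive's checkSortColsAndJoinCols):
// every join column must appear as the corresponding leading sort column of the table.
static boolean joinColsMatchSortPrefix(List<Order> sortCols, List<String> joinCols) {
  if (sortCols == null || sortCols.size() < joinCols.size()) {
    return false;
  }
  for (int i = 0; i < joinCols.size(); i++) {
    if (!sortCols.get(i).getCol().equals(joinCols.get(i))) {
      return false;
    }
  }
  return true;
}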