use of org.apache.hadoop.hive.ql.exec.ColumnInfo in project hive by apache.
the class HiveGBOpConvUtil method getReduceKeysForRS.
* Get Reduce Keys for RS following MapSide GB
* @param reduceKeys
* assumed to be deduped list of exprs
* @param outputKeyColumnNames
* @param colExprMap
* @return List of ExprNodeDesc of ReduceKeys
* @throws SemanticException
/**
 * Generates the reduce-key expressions for schema positions startPos..endPos of inOp and
 * records, for each generated key, a mapping in colExprMap from the key's internal
 * column name to its expression.
 *
 * NOTE(review): this excerpt has lost its closing braces (and possibly some statements),
 * so the nesting of the statements below cannot be confirmed from here. In particular,
 * outputKeyColumnNames and colInfoLst are never touched in the visible lines, and no
 * increment of outColNameIndx is visible -- confirm against the full source.
 *
 * @param inOp operator whose schema supplies the key columns
 * @param startPos first schema position to turn into a reduce key
 * @param endPos last schema position; a negative value yields an empty key list
 * @param outputKeyColumnNames not referenced in the visible lines
 * @param addOnlyOneKeyColName when true, the guarded statement applies only to the first key
 * @param colInfoLst not referenced in the visible lines
 * @param colExprMap receives internal-name to expression entries for the generated keys
 * @param addEmptyTabAlias when true, "" is used as the table alias instead of null
 * @param setColToNonVirtual forwarded unchanged to ExprNodeDescUtils.genExprNodeDesc
 * @return the list of reduce-key expressions (empty when endPos is negative)
 * @throws SemanticException declared by the signature; no throw site is visible here
 */
private static ArrayList<ExprNodeDesc> getReduceKeysForRS(Operator inOp, int startPos, int endPos, List<String> outputKeyColumnNames, boolean addOnlyOneKeyColName, ArrayList<ColumnInfo> colInfoLst, Map<String, ExprNodeDesc> colExprMap, boolean addEmptyTabAlias, boolean setColToNonVirtual) throws SemanticException {
ArrayList<ExprNodeDesc> reduceKeys = null;
// A negative end position means "no keys": return an empty list rather than null.
if (endPos < 0) {
reduceKeys = new ArrayList<ExprNodeDesc>();
} else {
// One column expression per schema position in [startPos, endPos].
reduceKeys = ExprNodeDescUtils.genExprNodeDesc(inOp, startPos, endPos, addEmptyTabAlias, setColToNonVirtual);
// Output column numbering starts at startPos, not at 0.
int outColNameIndx = startPos;
for (int i = 0; i < reduceKeys.size(); ++i) {
String outputColName = SemanticAnalyzer.getColumnInternalName(outColNameIndx);
// Either every key is eligible, or only the first one when addOnlyOneKeyColName is set.
if (!addOnlyOneKeyColName || i == 0) {
// TODO: Verify if this is needed (why can't it always be null/empty?)
String tabAlias = addEmptyTabAlias ? "" : null;
// The internal name is the ReduceField.KEY prefix, a dot, and the generated output column name.
ColumnInfo colInfo = new ColumnInfo(Utilities.ReduceField.KEY.toString() + "." + outputColName, reduceKeys.get(i).getTypeInfo(), tabAlias, false);
colExprMap.put(colInfo.getInternalName(), reduceKeys.get(i));
return reduceKeys;
use of org.apache.hadoop.hive.ql.exec.ColumnInfo in project hive by apache.
the class HiveOpConverter method visit.
* TODO: 1. PPD needs to get pushed in to TS
* @param scanRel
* @return
/**
 * Translates a HiveTableScan relational node into a TableScanOperator: it resolves a
 * ColumnInfo for every field of the scan's row type, builds a TableScanDesc, creates the
 * operator, registers it in topOps under the table alias and returns it wrapped in an OpAttr.
 *
 * NOTE(review): this excerpt has lost its closing braces and several statements (for
 * example, nothing is visibly added to colInfos, virtualCols, neededColumnIDs,
 * neededColumnNames, vcolsInCalcite or partColNames), so statement nesting and the
 * population of those collections must be confirmed against the full source.
 *
 * @param scanRel the table scan node to translate
 * @return an OpAttr built from the (possibly uniquified) table alias, vcolsInCalcite and
 *         the new TableScanOperator
 */
OpAttr visit(HiveTableScan scanRel) {
if (LOG.isDebugEnabled()) {
LOG.debug("Translating operator rel#" + scanRel.getId() + ":" + scanRel.getRelTypeName() + " with row type: [" + scanRel.getRowType() + "]");
RelOptHiveTable ht = (RelOptHiveTable) scanRel.getTable();
// 1. Setup TableScan Desc
// 1.1 Build col details used by scan
ArrayList<ColumnInfo> colInfos = new ArrayList<ColumnInfo>();
List<VirtualColumn> virtualCols = new ArrayList<VirtualColumn>();
List<Integer> neededColumnIDs = new ArrayList<Integer>();
List<String> neededColumnNames = new ArrayList<String>();
Set<Integer> vcolsInCalcite = new HashSet<Integer>();
List<String> partColNames = new ArrayList<String>();
// Position-keyed lookups for the three kinds of columns: virtual, partition, regular.
Map<Integer, VirtualColumn> VColsMap = HiveCalciteUtil.getVColsMap(ht.getVirtualCols(), ht.getNoOfNonVirtualCols());
Map<Integer, ColumnInfo> posToPartColInfo = ht.getPartColInfoMap();
Map<Integer, ColumnInfo> posToNonPartColInfo = ht.getNonPartColInfoMap();
List<Integer> neededColIndxsFrmReloptHT = scanRel.getNeededColIndxsFrmReloptHT();
List<String> scanColNames = scanRel.getRowType().getFieldNames();
String tableAlias = scanRel.getConcatQbIDAlias();
String colName;
ColumnInfo colInfo;
VirtualColumn vc;
// Walk every field of the scan's row type and pick the ColumnInfo source by position.
for (int index = 0; index < scanRel.getRowType().getFieldList().size(); index++) {
colName = scanColNames.get(index);
if (VColsMap.containsKey(index)) {
// Virtual column: a fresh ColumnInfo is built from the VirtualColumn's name, type and hidden flag.
vc = VColsMap.get(index);
colInfo = new ColumnInfo(vc.getName(), vc.getTypeInfo(), tableAlias, true, vc.getIsHidden());
} else if (posToPartColInfo.containsKey(index)) {
// Partition column: reuse the ColumnInfo supplied by the table.
colInfo = posToPartColInfo.get(index);
} else {
// Anything else is looked up as a regular (non-partition) column.
colInfo = posToNonPartColInfo.get(index);
// NOTE(review): the body of this check is missing from the excerpt; presumably it
// records the column as needed -- confirm.
if (neededColIndxsFrmReloptHT.contains(index)) {
// 1.2 Create TableScanDesc
TableScanDesc tsd = new TableScanDesc(tableAlias, virtualCols, ht.getHiveTableMD());
// 1.3. Set Partition cols in TSDesc
// 1.4. Set needed cols in TSDesc
// 2. Setup TableScan
TableScanOperator ts = (TableScanOperator) OperatorFactory.get(semanticAnalyzer.getOpContext(), tsd, new RowSchema(colInfos));
// A table scan with the same alias is already registered in topOps:
// derive a different alias by appending uniqueCounter.
if (topOps.get(tableAlias) != null) {
tableAlias = tableAlias + this.uniqueCounter;
topOps.put(tableAlias, ts);
if (LOG.isDebugEnabled()) {
LOG.debug("Generated " + ts + " with row schema: [" + ts.getSchema() + "]");
return new OpAttr(tableAlias, vcolsInCalcite, ts);
use of org.apache.hadoop.hive.ql.exec.ColumnInfo in project hive by apache.
the class ExprNodeDescUtils method genExprNodeDesc.
* Build ExprNodeColumnDesc for the projections in the input operator from
* startPos to endPos (both included). Operator must have an associated
* colExprMap.
* @param inputOp
* Input Hive Operator
* @param startPos
* starting position in the input operator schema; must be >=0 and <=
* endPos
* @param endPos
* end position in the input operator schema; must be >=0.
* @return List of ExprNodeDesc
/**
 * Builds one ExprNodeColumnDesc per column of inputOp's schema signature for the
 * positions startPos..endPos (both inclusive), copying each column's type and internal name.
 *
 * NOTE(review): the closing braces are missing from this excerpt; the comments below
 * assume each "if" guards only the single assignment that follows it -- confirm.
 *
 * @param inputOp operator whose schema signature is read
 * @param startPos first position (inclusive) in the schema signature
 * @param endPos last position (inclusive) in the schema signature
 * @param addEmptyTabAlias when true, "" replaces the column's own table alias
 * @param setColToNonVirtual when true, the virtual-column flag is forced to false
 * @return the column expressions, in schema order
 */
public static ArrayList<ExprNodeDesc> genExprNodeDesc(Operator inputOp, int startPos, int endPos, boolean addEmptyTabAlias, boolean setColToNonVirtual) {
ArrayList<ExprNodeDesc> exprColLst = new ArrayList<ExprNodeDesc>();
List<ColumnInfo> colInfoLst = inputOp.getSchema().getSignature();
String tabAlias;
boolean vc;
ColumnInfo ci;
// The upper bound is inclusive: endPos itself is converted too.
for (int i = startPos; i <= endPos; i++) {
ci = colInfoLst.get(i);
// Start from the column's own alias, then optionally blank it out.
tabAlias = ci.getTabAlias();
if (addEmptyTabAlias) {
tabAlias = "";
// Start from the column's own virtual flag, then optionally clear it.
vc = ci.getIsVirtualCol();
if (setColToNonVirtual) {
vc = false;
exprColLst.add(new ExprNodeColumnDesc(ci.getType(), ci.getInternalName(), tabAlias, vc));
return exprColLst;
use of org.apache.hadoop.hive.ql.exec.ColumnInfo in project hive by apache.
the class ExprProcFactory method getExprString.
* Get the expression string of an expression node.
/**
 * Produces a display string for an expression node.
 *
 * For a column expression, it resolves a column alias and table alias (from the row
 * schema, and from the lineage dependency's first base column when the table alias is
 * missing or starts with "_" or "$"), and returns "tabAlias.alias", the dependency's
 * expression, or the bare alias. For a generic function, it renders each child
 * recursively and delegates to the UDF's display string. Any other expression falls
 * back to expr.getExprString().
 *
 * NOTE(review): this excerpt has lost its closing braces and the bodies of the
 * "cond != null" checks, so exact nesting and any side effects on cond cannot be
 * confirmed from here.
 *
 * @param rs row schema used to look up the column by internal name
 * @param expr expression to render
 * @param lctx lineage context providing the dependency index
 * @param inpOp input operator used for dependency and schema lookups
 * @param cond optional predicate; may be null
 * @return the display string for expr
 */
public static String getExprString(RowSchema rs, ExprNodeDesc expr, LineageCtx lctx, Operator<? extends OperatorDesc> inpOp, Predicate cond) {
if (expr instanceof ExprNodeColumnDesc) {
ExprNodeColumnDesc col = (ExprNodeColumnDesc) expr;
String internalName = col.getColumn();
// Defaults: the internal name and the expression's own table alias,
// overridden below when the row schema has better values.
String alias = internalName;
String tabAlias = col.getTabAlias();
ColumnInfo ci = rs.getColumnInfo(internalName);
if (ci != null) {
if (ci.getAlias() != null) {
alias = ci.getAlias();
if (ci.getTabAlias() != null) {
tabAlias = ci.getTabAlias();
Dependency dep = lctx.getIndex().getDependency(inpOp, internalName);
// No usable table alias (null, or starting with "_" or "$") but a SIMPLE dependency
// exists: take the names from the dependency's first base column instead.
if ((tabAlias == null || tabAlias.startsWith("_") || tabAlias.startsWith("$")) && (dep != null && dep.getType() == DependencyType.SIMPLE)) {
Set<BaseColumnInfo> baseCols = dep.getBaseCols();
if (baseCols != null && !baseCols.isEmpty()) {
BaseColumnInfo baseCol = baseCols.iterator().next();
tabAlias = baseCol.getTabAlias().getAlias();
alias = baseCol.getColumn().getName();
// A non-empty table alias that does not start with "_" or "$" gives the qualified form.
if (tabAlias != null && tabAlias.length() > 0 && !tabAlias.startsWith("_") && !tabAlias.startsWith("$")) {
// NOTE(review): the body of this check is missing from the excerpt.
if (cond != null && !findSourceColumn(lctx, cond, tabAlias, alias) && dep != null) {
return tabAlias + "." + alias;
if (dep != null) {
// NOTE(review): the body of this check is missing from the excerpt.
if (cond != null) {
if (dep.getExpr() != null) {
return dep.getExpr();
// An alias starting with "_" gets one more lookup, this time in the
// input operator's own schema.
if (alias.startsWith("_")) {
ci = inpOp.getSchema().getColumnInfo(internalName);
if (ci != null && ci.getAlias() != null) {
alias = ci.getAlias();
return alias;
} else if (expr instanceof ExprNodeGenericFuncDesc) {
ExprNodeGenericFuncDesc func = (ExprNodeGenericFuncDesc) expr;
List<ExprNodeDesc> children = func.getChildren();
// Render each child with the same context, then let the UDF format the call.
String[] childrenExprStrings = new String[children.size()];
for (int i = 0; i < childrenExprStrings.length; i++) {
childrenExprStrings[i] = getExprString(rs, children.get(i), lctx, inpOp, cond);
return func.getGenericUDF().getDisplayString(childrenExprStrings);
return expr.getExprString();
use of org.apache.hadoop.hive.ql.exec.ColumnInfo in project hive by apache.
the class GenMRSkewJoinProcessor method processSkewJoin.
* Create tasks for processing skew joins. The idea is (HIVE-964) to use
* separated jobs and map-joins to handle skew joins.
* <p>
* <ul>
* <li>
* Number of MR jobs to handle skew keys is the number of tables minus 1 (we
* can stream the last table, so big keys in the last table will not be a
* problem).
* <li>
* At runtime in Join, we output the big keys of one table into one corresponding
* directory, and the same keys in the other tables into different dirs (one for
* each table). The directories will look like:
* <ul>
* <li>
* dir-T1-bigkeys(containing big keys in T1), dir-T2-keys(containing keys
* which is big in T1),dir-T3-keys(containing keys which is big in T1), ...
* <li>
* dir-T1-keys(containing keys which is big in T2), dir-T2-bigkeys(containing
* big keys in T2),dir-T3-keys(containing keys which is big in T2), ...
* <li>
* dir-T1-keys(containing keys which is big in T3), dir-T2-keys(containing
* keys which is big in T3),dir-T3-bigkeys(containing big keys in T3), ... .....
* </ul>
* </ul>
* For each table, we launch one mapjoin job, taking the directory containing
* big keys in this table and corresponding dirs in other tables as input.
* (Actually, one job for each row in the list above.)
* <p>
* For more discussion, please check HIVE-964.
/**
 * Builds the follow-up map-join plans that handle skewed join keys for joinOp and hooks
 * them behind currTask through a ConditionalTask resolved by ConditionalResolverSkewJoin.
 *
 * Visible steps: (1) compute per-alias directories for big keys, small keys and results;
 * (2) derive, per alias, the spilled-row layout (value columns followed by join keys) as a
 * TableDesc and RowSchema; (3) for every table except the last, assemble a MapWork that
 * reads the big-key directory, a MapredLocalWork that fetches the other tables' key
 * directories, and a MapJoinOperator that replaces the cloned join operator as parent of
 * the join's children; (4) wrap the resulting tasks in a conditional task.
 *
 * NOTE(review): this excerpt has lost its closing braces and a number of statements
 * (several "if"/"for" bodies below are empty as shown), so exact nesting, early returns
 * and task wiring must be confirmed against the full source.
 *
 * @param joinOp the join operator whose skewed keys are to be handled
 * @param currTask the task containing joinOp; its child tasks are replaced
 * @param parseCtx parse context supplying configuration and the temp path
 * @throws SemanticException declared by the signature; no throw site is visible here
 */
public static void processSkewJoin(JoinOperator joinOp, Task<? extends Serializable> currTask, ParseContext parseCtx) throws SemanticException {
// now does not work with outer joins
// NOTE(review): the body of this guard is missing from the excerpt (presumably an early return).
if (!GenMRSkewJoinProcessor.skewJoinEnabled(parseCtx.getConf(), joinOp)) {
List<Task<? extends Serializable>> children = currTask.getChildTasks();
Path baseTmpDir = parseCtx.getContext().getMRTmpPath();
JoinDesc joinDescriptor = joinOp.getConf();
Map<Byte, List<ExprNodeDesc>> joinValues = joinDescriptor.getExprs();
int numAliases = joinValues.size();
// Directory layout, keyed by table tag: where each table's big keys go, where the
// matching keys of every other table go, and where each skew-join job writes results.
Map<Byte, Path> bigKeysDirMap = new HashMap<Byte, Path>();
Map<Byte, Map<Byte, Path>> smallKeysDirMap = new HashMap<Byte, Map<Byte, Path>>();
Map<Byte, Path> skewJoinJobResultsDir = new HashMap<Byte, Path>();
Byte[] tags = joinDescriptor.getTagOrder();
for (int i = 0; i < numAliases; i++) {
Byte alias = tags[i];
bigKeysDirMap.put(alias, getBigKeysDir(baseTmpDir, alias));
Map<Byte, Path> smallKeysMap = new HashMap<Byte, Path>();
smallKeysDirMap.put(alias, smallKeysMap);
// Every other table gets its own small-keys directory relative to this alias.
for (Byte src2 : tags) {
if (!src2.equals(alias)) {
smallKeysMap.put(src2, getSmallKeysDir(baseTmpDir, alias, src2));
skewJoinJobResultsDir.put(alias, getBigKeysSkewJoinResultDir(baseTmpDir, alias));
// The skew-key setting is read from configuration (HIVESKEWJOINKEY) and stored on the join descriptor.
joinDescriptor.setSkewKeyDefinition(HiveConf.getIntVar(parseCtx.getConf(), HiveConf.ConfVars.HIVESKEWJOINKEY));
HashMap<Path, Task<? extends Serializable>> bigKeysDirToTaskMap = new HashMap<Path, Task<? extends Serializable>>();
List<Serializable> listWorks = new ArrayList<Serializable>();
List<Task<? extends Serializable>> listTasks = new ArrayList<Task<? extends Serializable>>();
MapredWork currPlan = (MapredWork) currTask.getWork();
// The join-key names and types are read from a clone of the current plan's reduce key descriptor.
TableDesc keyTblDesc = (TableDesc) currPlan.getReduceWork().getKeyDesc().clone();
List<String> joinKeys = Utilities.getColumnNames(keyTblDesc.getProperties());
List<String> joinKeyTypes = Utilities.getColumnTypes(keyTblDesc.getProperties());
Map<Byte, TableDesc> tableDescList = new HashMap<Byte, TableDesc>();
Map<Byte, RowSchema> rowSchemaList = new HashMap<Byte, RowSchema>();
Map<Byte, List<ExprNodeDesc>> newJoinValues = new HashMap<Byte, List<ExprNodeDesc>>();
Map<Byte, List<ExprNodeDesc>> newJoinKeys = new HashMap<Byte, List<ExprNodeDesc>>();
// used for create mapJoinDesc, should be in order
List<TableDesc> newJoinValueTblDesc = new ArrayList<TableDesc>();
// NOTE(review): the body of this loop is missing from the excerpt; since
// newJoinValueTblDesc.set(...) is used further down, it presumably pre-sizes that list.
for (Byte tag : tags) {
// Per alias: describe the spilled rows as "<value columns>,<join key columns>".
for (int i = 0; i < numAliases; i++) {
Byte alias = tags[i];
List<ExprNodeDesc> valueCols = joinValues.get(alias);
String colNames = "";
String colTypes = "";
int columnSize = valueCols.size();
List<ExprNodeDesc> newValueExpr = new ArrayList<ExprNodeDesc>();
List<ExprNodeDesc> newKeyExpr = new ArrayList<ExprNodeDesc>();
ArrayList<ColumnInfo> columnInfos = new ArrayList<ColumnInfo>();
// "first" controls comma placement while building the comma-separated lists.
boolean first = true;
for (int k = 0; k < columnSize; k++) {
TypeInfo type = valueCols.get(k).getTypeInfo();
// any name, it does not matter.
String newColName = i + "_VALUE_" + k;
ColumnInfo columnInfo = new ColumnInfo(newColName, type, alias.toString(), false);
newValueExpr.add(new ExprNodeColumnDesc(columnInfo));
if (!first) {
colNames = colNames + ",";
colTypes = colTypes + ",";
first = false;
colNames = colNames + newColName;
colTypes = colTypes + valueCols.get(k).getTypeString();
// we are putting join keys at last part of the spilled table
for (int k = 0; k < joinKeys.size(); k++) {
if (!first) {
colNames = colNames + ",";
colTypes = colTypes + ",";
first = false;
colNames = colNames + joinKeys.get(k);
colTypes = colTypes + joinKeyTypes.get(k);
ColumnInfo columnInfo = new ColumnInfo(joinKeys.get(k), TypeInfoFactory.getPrimitiveTypeInfo(joinKeyTypes.get(k)), alias.toString(), false);
newKeyExpr.add(new ExprNodeColumnDesc(columnInfo));
newJoinValues.put(alias, newValueExpr);
newJoinKeys.put(alias, newKeyExpr);
tableDescList.put(alias, Utilities.getTableDesc(colNames, colTypes));
rowSchemaList.put(alias, new RowSchema(columnInfos));
// construct value table Desc
String valueColNames = "";
String valueColTypes = "";
first = true;
for (int k = 0; k < columnSize; k++) {
// any name, it does not matter.
String newColName = i + "_VALUE_" + k;
if (!first) {
valueColNames = valueColNames + ",";
valueColTypes = valueColTypes + ",";
valueColNames = valueColNames + newColName;
valueColTypes = valueColTypes + valueCols.get(k).getTypeString();
first = false;
newJoinValueTblDesc.set(Byte.valueOf((byte) i), Utilities.getTableDesc(valueColNames, valueColTypes));
// One map-join plan per table except the last one (the bound is numAliases - 1).
for (int i = 0; i < numAliases - 1; i++) {
Byte src = tags[i];
MapWork newPlan = PlanUtils.getMapRedWork().getMapWork();
// This code has been only added for testing
boolean mapperCannotSpanPartns = parseCtx.getConf().getBoolVar(HiveConf.ConfVars.HIVE_MAPPER_CANNOT_SPAN_MULTIPLE_PARTITIONS);
// Work on a clone so the original plan's reducer is left untouched.
MapredWork clonePlan = SerializationUtilities.clonePlan(currPlan);
// One temporary table scan per tag, each carrying that tag's spilled-table descriptor.
Operator<? extends OperatorDesc>[] parentOps = new TableScanOperator[tags.length];
for (int k = 0; k < tags.length; k++) {
Operator<? extends OperatorDesc> ts = GenMapRedUtils.createTemporaryTableScanOperator(joinOp.getCompilationOpContext(), rowSchemaList.get((byte) k));
((TableScanOperator) ts).setTableDescSkewJoin(tableDescList.get((byte) k));
parentOps[k] = ts;
// The table at position i is read from its big-keys directory by the map work.
Operator<? extends OperatorDesc> tblScan_op = parentOps[i];
ArrayList<String> aliases = new ArrayList<String>();
String alias = src.toString().intern();
Path bigKeyDirPath = bigKeysDirMap.get(src);
newPlan.addPathToAlias(bigKeyDirPath, aliases);
newPlan.getAliasToWork().put(alias, tblScan_op);
PartitionDesc part = new PartitionDesc(tableDescList.get(src), null);
newPlan.addPathToPartitionInfo(bigKeyDirPath, part);
newPlan.getAliasToPartnInfo().put(alias, part);
Operator<? extends OperatorDesc> reducer = clonePlan.getReduceWork().getReducer();
assert reducer instanceof JoinOperator;
JoinOperator cloneJoinOp = (JoinOperator) reducer;
String dumpFilePrefix = "mapfile" + PlanUtils.getCountForMapJoinDumpFilePrefix();
// NOTE(review): newJoinValueTblDesc is passed for two consecutive arguments, and i is
// presumably the big-table position -- confirm against the MapJoinDesc constructor.
MapJoinDesc mapJoinDescriptor = new MapJoinDesc(newJoinKeys, keyTblDesc, newJoinValues, newJoinValueTblDesc, newJoinValueTblDesc, joinDescriptor.getOutputColumnNames(), i, joinDescriptor.getConds(), joinDescriptor.getFilters(), joinDescriptor.getNoOuterJoin(), dumpFilePrefix, joinDescriptor.getMemoryMonitorInfo(), joinDescriptor.getInMemoryDataSize());
MapredLocalWork localPlan = new MapredLocalWork(new LinkedHashMap<String, Operator<? extends OperatorDesc>>(), new LinkedHashMap<String, FetchWork>());
Map<Byte, Path> smallTblDirs = smallKeysDirMap.get(src);
// The remaining tables are loaded by the local plan from their small-keys directories.
for (int j = 0; j < numAliases; j++) {
// NOTE(review): the body of this check is missing from the excerpt; presumably it
// skips the table at position i, which the map work above already reads -- confirm.
if (j == i) {
Byte small_alias = tags[j];
Operator<? extends OperatorDesc> tblScan_op2 = parentOps[j];
localPlan.getAliasToWork().put(small_alias.toString(), tblScan_op2);
Path tblDir = smallTblDirs.get(small_alias);
localPlan.getAliasToFetchWork().put(small_alias.toString(), new FetchWork(tblDir, tableDescList.get(small_alias)));
// construct a map join and set it as the child operator of tblScan_op
MapJoinOperator mapJoinOp = (MapJoinOperator) OperatorFactory.getAndMakeChild(joinOp.getCompilationOpContext(), mapJoinDescriptor, (RowSchema) null, parentOps);
// change the children of the original join operator to point to the map
// join operator
List<Operator<? extends OperatorDesc>> childOps = cloneJoinOp.getChildOperators();
for (Operator<? extends OperatorDesc> childOp : childOps) {
childOp.replaceParent(cloneJoinOp, mapJoinOp);
// Mapper count and minimum split size for the new plan come from the skew-join settings.
HiveConf jc = new HiveConf(parseCtx.getConf(), GenMRSkewJoinProcessor.class);
newPlan.setNumMapTasks(HiveConf.getIntVar(jc, HiveConf.ConfVars.HIVESKEWJOINMAPJOINNUMMAPTASK));
newPlan.setMinSplitSize(HiveConf.getLongVar(jc, HiveConf.ConfVars.HIVESKEWJOINMAPJOINMINSPLIT));
MapredWork w = new MapredWork();
Task<? extends Serializable> skewJoinMapJoinTask = TaskFactory.get(w);
// Remember which task handles which big-keys directory; this map is handed to the resolver context below.
bigKeysDirToTaskMap.put(bigKeyDirPath, skewJoinMapJoinTask);
// NOTE(review): the bodies of the loops below are missing from the excerpt; they appear
// to re-wire the original child tasks around the new skew-join tasks -- confirm.
if (children != null) {
for (Task<? extends Serializable> tsk : listTasks) {
for (Task<? extends Serializable> oldChild : children) {
currTask.setChildTasks(new ArrayList<Task<? extends Serializable>>());
for (Task<? extends Serializable> oldChild : children) {
// The conditional task is built over the collected works and given the skew-join resolver.
ConditionalResolverSkewJoinCtx context = new ConditionalResolverSkewJoinCtx(bigKeysDirToTaskMap, children);
ConditionalWork cndWork = new ConditionalWork(listWorks);
ConditionalTask cndTsk = (ConditionalTask) TaskFactory.get(cndWork);
cndTsk.setResolver(new ConditionalResolverSkewJoin());
currTask.setChildTasks(new ArrayList<Task<? extends Serializable>>());