use of org.apache.hadoop.hive.ql.exec.RowSchema in project hive by apache.
the class MapJoinProcessor method convertSMBJoinToMapJoin.
/**
 * Convert a sort-merge join to a map-side join.
 *
 * @param hconf
 * @param smbJoinOp
 *          the sort-merge join operator to convert
 * @param bigTablePos
 *          position of the source to be read as part of the map-reduce framework. All other
 *          sources are cached in memory
 * @param noCheckOuterJoin
 */
public static MapJoinOperator convertSMBJoinToMapJoin(HiveConf hconf, SMBMapJoinOperator smbJoinOp,
    int bigTablePos, boolean noCheckOuterJoin) throws SemanticException {
  // Create a new map join operator
  SMBJoinDesc smbJoinDesc = smbJoinOp.getConf();
  List<ExprNodeDesc> keyCols = smbJoinDesc.getKeys().get(Byte.valueOf((byte) 0));
  TableDesc keyTableDesc = PlanUtils.getMapJoinKeyTableDesc(hconf,
      PlanUtils.getFieldSchemasFromColumnList(keyCols, MAPJOINKEY_FIELDPREFIX));
  MapJoinDesc mapJoinDesc = new MapJoinDesc(smbJoinDesc.getKeys(), keyTableDesc, smbJoinDesc.getExprs(),
      smbJoinDesc.getValueTblDescs(), smbJoinDesc.getValueTblDescs(), smbJoinDesc.getOutputColumnNames(),
      bigTablePos, smbJoinDesc.getConds(), smbJoinDesc.getFilters(), smbJoinDesc.isNoOuterJoin(),
      smbJoinDesc.getDumpFilePrefix());
  mapJoinDesc.setStatistics(smbJoinDesc.getStatistics());
  RowSchema joinRS = smbJoinOp.getSchema();
  // The mapjoin has the same schema as the join operator
  MapJoinOperator mapJoinOp = (MapJoinOperator) OperatorFactory.getAndMakeChild(
      smbJoinOp.getCompilationOpContext(), mapJoinDesc, joinRS,
      new ArrayList<Operator<? extends OperatorDesc>>());
  // change the children of the original join operator to point to the map
  // join operator
  List<Operator<? extends OperatorDesc>> childOps = smbJoinOp.getChildOperators();
  for (Operator<? extends OperatorDesc> childOp : childOps) {
    childOp.replaceParent(smbJoinOp, mapJoinOp);
  }
  mapJoinOp.setChildOperators(childOps);
  smbJoinOp.setChildOperators(null);
  // change the parent of the original SMBjoin operator to point to the map
  // join operator
  List<Operator<? extends OperatorDesc>> parentOps = smbJoinOp.getParentOperators();
  for (Operator<? extends OperatorDesc> parentOp : parentOps) {
    parentOp.replaceChild(smbJoinOp, mapJoinOp);
  }
  mapJoinOp.setParentOperators(parentOps);
  smbJoinOp.setParentOperators(null);
  return mapJoinOp;
}
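As a rough usage sketch, not taken from the Hive source: the SmbToMapJoinExample class, its rewrite helper, and the choice of position 0 as the big table are assumptions made for illustration. A caller holding a HiveConf and an SMBMapJoinOperator could apply the conversion like this:

import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.exec.MapJoinOperator;
import org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator;
import org.apache.hadoop.hive.ql.optimizer.MapJoinProcessor;
import org.apache.hadoop.hive.ql.parse.SemanticException;

final class SmbToMapJoinExample {
  // Hypothetical helper: rewrites an SMB join in place, streaming table 0 and caching the rest.
  static MapJoinOperator rewrite(HiveConf conf, SMBMapJoinOperator smbJoin) throws SemanticException {
    int bigTablePos = 0;              // position of the source read through the map-reduce framework
    boolean noCheckOuterJoin = false; // keep the outer-join safety check
    // The returned operator reuses the SMB join's RowSchema, so downstream operators are unchanged.
    return MapJoinProcessor.convertSMBJoinToMapJoin(conf, smbJoin, bigTablePos, noCheckOuterJoin);
  }
}

Because convertSMBJoinToMapJoin re-links both the children and the parents of the original operator, the caller does not need to patch the operator tree afterwards.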
use of org.apache.hadoop.hive.ql.exec.RowSchema in project hive by apache.
the class CorrelationUtilities method removeReduceSinkForGroupBy.
protected static void removeReduceSinkForGroupBy(ReduceSinkOperator cRS, GroupByOperator cGBYr,
    ParseContext context, AbstractCorrelationProcCtx procCtx) throws SemanticException {
  Operator<?> parent = getSingleParent(cRS);
  if ((parent instanceof GroupByOperator) && procCtx.isMapAggr()) {
    // pRS-cGBYm-cRS-cGBYr (map aggregation) --> pRS-cGBYr(COMPLETE)
    // copies desc of cGBYm to cGBYr and remove cGBYm and cRS
    GroupByOperator cGBYm = (GroupByOperator) parent;
    cGBYr.getConf().setKeys(ExprNodeDescUtils.backtrack(
        ExprNodeDescUtils.backtrack(cGBYr.getConf().getKeys(), cGBYr, cRS), cRS, cGBYm));
    cGBYr.getConf().setAggregators(cGBYm.getConf().getAggregators());
    for (AggregationDesc aggr : cGBYm.getConf().getAggregators()) {
      aggr.setMode(GenericUDAFEvaluator.Mode.COMPLETE);
    }
    cGBYr.setColumnExprMap(cGBYm.getColumnExprMap());
    cGBYr.setSchema(cGBYm.getSchema());
  } else {
    // pRS-cRS-cGBYr (no map aggregation) --> pRS-cGBYr(COMPLETE)
    // revert expressions of cGBYr to that of cRS
    cGBYr.getConf().setKeys(ExprNodeDescUtils.backtrack(cGBYr.getConf().getKeys(), cGBYr, cRS));
    for (AggregationDesc aggr : cGBYr.getConf().getAggregators()) {
      aggr.setParameters(ExprNodeDescUtils.backtrack(aggr.getParameters(), cGBYr, cRS));
    }
    Map<String, ExprNodeDesc> oldMap = cGBYr.getColumnExprMap();
    RowSchema oldRS = cGBYr.getSchema();
    Map<String, ExprNodeDesc> newMap = new HashMap<String, ExprNodeDesc>();
    ArrayList<ColumnInfo> newRS = new ArrayList<ColumnInfo>();
    List<String> outputCols = cGBYr.getConf().getOutputColumnNames();
    for (int i = 0; i < outputCols.size(); i++) {
      String colName = outputCols.get(i);
      ColumnInfo colInfo = oldRS.getColumnInfo(colName);
      newRS.add(colInfo);
      ExprNodeDesc colExpr = ExprNodeDescUtils.backtrack(oldMap.get(colName), cGBYr, cRS);
      if (colExpr != null) {
        newMap.put(colInfo.getInternalName(), colExpr);
      }
    }
    cGBYr.setColumnExprMap(newMap);
    cGBYr.setSchema(new RowSchema(newRS));
  }
  cGBYr.getConf().setMode(GroupByDesc.Mode.COMPLETE);
  removeOperator(cRS, cGBYr, parent, context);
  procCtx.addRemovedOperator(cRS);
  if ((parent instanceof GroupByOperator) && procCtx.isMapAggr()) {
    removeOperator(parent, cGBYr, getSingleParent(parent), context);
    procCtx.addRemovedOperator(cGBYr);
  }
}
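The else branch shows the recurring RowSchema pattern in these utilities: look up the ColumnInfo for each output column name and wrap the collected list in a new RowSchema. Below is a minimal, standalone sketch of just that construction; the column names, types, and the table alias "t" are invented for illustration.

import java.util.ArrayList;

import org.apache.hadoop.hive.ql.exec.ColumnInfo;
import org.apache.hadoop.hive.ql.exec.RowSchema;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;

final class RowSchemaRebuildExample {
  public static void main(String[] args) {
    // Stand-ins for the ColumnInfos looked up via oldRS.getColumnInfo(colName) above.
    ArrayList<ColumnInfo> newSignature = new ArrayList<ColumnInfo>();
    newSignature.add(new ColumnInfo("_col0", TypeInfoFactory.stringTypeInfo, "t", false));
    newSignature.add(new ColumnInfo("_col1", TypeInfoFactory.longTypeInfo, "t", false));
    // The rebuilt schema keeps only the columns the operator actually emits.
    RowSchema newRS = new RowSchema(newSignature);
    System.out.println(newRS);
  }
}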
use of org.apache.hadoop.hive.ql.exec.RowSchema in project hive by apache.
the class HiveOpConverter method genReduceSinkAndBacktrackSelect.
private static SelectOperator genReduceSinkAndBacktrackSelect(Operator<?> input, ExprNodeDesc[] keys,
    int tag, ArrayList<ExprNodeDesc> partitionCols, String order, String nullOrder, int numReducers,
    Operation acidOperation, HiveConf hiveConf, List<String> keepColNames) throws SemanticException {
  // 1. Generate RS operator
  // 1.1 Prune the tableNames, only count the tableNames that are not empty strings
  // as empty string in table aliases is only allowed for virtual columns.
  String tableAlias = null;
  Set<String> tableNames = input.getSchema().getTableNames();
  for (String tableName : tableNames) {
    if (tableName != null) {
      if (tableName.length() == 0) {
        if (tableAlias == null) {
          tableAlias = tableName;
        }
      } else {
        if (tableAlias == null || tableAlias.length() == 0) {
          tableAlias = tableName;
        } else {
          if (!tableName.equals(tableAlias)) {
            throw new SemanticException("In CBO return path, genReduceSinkAndBacktrackSelect is expecting only one tableAlias but there is more than one");
          }
        }
      }
    }
  }
  if (tableAlias == null) {
    throw new SemanticException("In CBO return path, genReduceSinkAndBacktrackSelect is expecting only one tableAlias but there is none");
  }
  // 1.2 Now generate RS operator
  ReduceSinkOperator rsOp = genReduceSink(input, tableAlias, keys, tag, partitionCols, order,
      nullOrder, numReducers, acidOperation, hiveConf);
  // 2. Generate backtrack Select operator
  Map<String, ExprNodeDesc> descriptors = buildBacktrackFromReduceSink(keepColNames,
      rsOp.getConf().getOutputKeyColumnNames(), rsOp.getConf().getOutputValueColumnNames(),
      rsOp.getValueIndex(), input);
  SelectDesc selectDesc = new SelectDesc(new ArrayList<ExprNodeDesc>(descriptors.values()),
      new ArrayList<String>(descriptors.keySet()));
  ArrayList<ColumnInfo> cinfoLst = createColInfosSubset(input, keepColNames);
  SelectOperator selectOp = (SelectOperator) OperatorFactory.getAndMakeChild(selectDesc,
      new RowSchema(cinfoLst), rsOp);
  selectOp.setColumnExprMap(descriptors);
  if (LOG.isDebugEnabled()) {
    LOG.debug("Generated " + selectOp + " with row schema: [" + selectOp.getSchema() + "]");
  }
  return selectOp;
}
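The alias-resolution loop above packs three rules into nested branches. The following dependency-free sketch, with made-up class and method names, restates the same rule over a plain set of names: empty aliases (which belong to virtual columns) are only a fallback, and two distinct non-empty aliases are an error.

import java.util.LinkedHashSet;
import java.util.Set;

final class TableAliasResolution {
  // Mirrors the loop in genReduceSinkAndBacktrackSelect.
  static String resolve(Set<String> tableNames) {
    String tableAlias = null;
    for (String tableName : tableNames) {
      if (tableName == null) {
        continue;
      }
      if (tableName.isEmpty()) {
        if (tableAlias == null) {
          tableAlias = tableName;    // remember the empty alias only as a fallback
        }
      } else if (tableAlias == null || tableAlias.isEmpty()) {
        tableAlias = tableName;      // a real alias always wins over the empty one
      } else if (!tableName.equals(tableAlias)) {
        throw new IllegalStateException("expecting only one tableAlias but there is more than one");
      }
    }
    if (tableAlias == null) {
      throw new IllegalStateException("expecting only one tableAlias but there is none");
    }
    return tableAlias;
  }

  public static void main(String[] args) {
    Set<String> names = new LinkedHashSet<String>();
    names.add("");      // virtual columns carry an empty alias
    names.add("src");
    System.out.println(resolve(names)); // prints "src"
  }
}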
use of org.apache.hadoop.hive.ql.exec.RowSchema in project hive by apache.
the class HiveOpConverter method genReduceSink.
@SuppressWarnings({ "rawtypes", "unchecked" })
private static ReduceSinkOperator genReduceSink(Operator<?> input, String tableAlias, ExprNodeDesc[] keys,
    int tag, ArrayList<ExprNodeDesc> partitionCols, String order, String nullOrder, int numReducers,
    Operation acidOperation, HiveConf hiveConf) throws SemanticException {
  // dummy for backtracking
  Operator dummy = Operator.createDummy();
  dummy.setParentOperators(Arrays.asList(input));
  ArrayList<ExprNodeDesc> reduceKeys = new ArrayList<ExprNodeDesc>();
  ArrayList<ExprNodeDesc> reduceKeysBack = new ArrayList<ExprNodeDesc>();
  // Compute join keys and store in reduceKeys
  for (ExprNodeDesc key : keys) {
    reduceKeys.add(key);
    reduceKeysBack.add(ExprNodeDescUtils.backtrack(key, dummy, input));
  }
  // Walk over the input schema and copy in the output
  ArrayList<ExprNodeDesc> reduceValues = new ArrayList<ExprNodeDesc>();
  ArrayList<ExprNodeDesc> reduceValuesBack = new ArrayList<ExprNodeDesc>();
  Map<String, ExprNodeDesc> colExprMap = new HashMap<String, ExprNodeDesc>();
  List<ColumnInfo> inputColumns = input.getSchema().getSignature();
  ArrayList<ColumnInfo> outputColumns = new ArrayList<ColumnInfo>();
  List<String> outputColumnNames = new ArrayList<String>();
  int[] index = new int[inputColumns.size()];
  for (int i = 0; i < inputColumns.size(); i++) {
    ColumnInfo colInfo = inputColumns.get(i);
    String outputColName = colInfo.getInternalName();
    ExprNodeColumnDesc expr = new ExprNodeColumnDesc(colInfo);
    // backtrack can be null when input is script operator
    ExprNodeDesc exprBack = ExprNodeDescUtils.backtrack(expr, dummy, input);
    int kindex = exprBack == null ? -1 : ExprNodeDescUtils.indexOf(exprBack, reduceKeysBack);
    if (kindex >= 0) {
      ColumnInfo newColInfo = new ColumnInfo(colInfo);
      newColInfo.setInternalName(Utilities.ReduceField.KEY + ".reducesinkkey" + kindex);
      newColInfo.setAlias(outputColName);
      newColInfo.setTabAlias(tableAlias);
      outputColumns.add(newColInfo);
      index[i] = kindex;
      continue;
    }
    int vindex = exprBack == null ? -1 : ExprNodeDescUtils.indexOf(exprBack, reduceValuesBack);
    if (vindex >= 0) {
      index[i] = -vindex - 1;
      continue;
    }
    index[i] = -reduceValues.size() - 1;
    reduceValues.add(expr);
    reduceValuesBack.add(exprBack);
    ColumnInfo newColInfo = new ColumnInfo(colInfo);
    newColInfo.setInternalName(Utilities.ReduceField.VALUE + "." + outputColName);
    newColInfo.setAlias(outputColName);
    newColInfo.setTabAlias(tableAlias);
    outputColumns.add(newColInfo);
    outputColumnNames.add(outputColName);
  }
  dummy.setParentOperators(null);
  // Use only 1 reducer if no reduce keys
  if (reduceKeys.size() == 0) {
    numReducers = 1;
    // Cartesian product is not supported in strict mode
    String error = StrictChecks.checkCartesian(hiveConf);
    if (error != null) {
      throw new SemanticException(error);
    }
  }
  ReduceSinkDesc rsDesc;
  if (order.isEmpty()) {
    rsDesc = PlanUtils.getReduceSinkDesc(reduceKeys, reduceValues, outputColumnNames, false, tag,
        reduceKeys.size(), numReducers, acidOperation);
  } else {
    rsDesc = PlanUtils.getReduceSinkDesc(reduceKeys, reduceValues, outputColumnNames, false, tag,
        partitionCols, order, nullOrder, numReducers, acidOperation);
  }
  ReduceSinkOperator rsOp = (ReduceSinkOperator) OperatorFactory.getAndMakeChild(rsDesc,
      new RowSchema(outputColumns), input);
  List<String> keyColNames = rsDesc.getOutputKeyColumnNames();
  for (int i = 0; i < keyColNames.size(); i++) {
    colExprMap.put(Utilities.ReduceField.KEY + "." + keyColNames.get(i), reduceKeys.get(i));
  }
  List<String> valColNames = rsDesc.getOutputValueColumnNames();
  for (int i = 0; i < valColNames.size(); i++) {
    colExprMap.put(Utilities.ReduceField.VALUE + "." + valColNames.get(i), reduceValues.get(i));
  }
  rsOp.setValueIndex(index);
  rsOp.setColumnExprMap(colExprMap);
  rsOp.setInputAliases(input.getSchema().getTableNames().toArray(
      new String[input.getSchema().getTableNames().size()]));
  if (LOG.isDebugEnabled()) {
    LOG.debug("Generated " + rsOp + " with row schema: [" + rsOp.getSchema() + "]");
  }
  return rsOp;
}
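The RowSchema handed to the new ReduceSinkOperator is built from copies of the input ColumnInfos whose internal names are rewritten into the KEY./VALUE. namespace. A minimal sketch of that renaming step follows; the column name, type, and table alias are invented, and only the key-side case is shown.

import java.util.ArrayList;

import org.apache.hadoop.hive.ql.exec.ColumnInfo;
import org.apache.hadoop.hive.ql.exec.RowSchema;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;

final class ReduceSinkSchemaExample {
  public static void main(String[] args) {
    // A stand-in for one entry of input.getSchema().getSignature().
    ColumnInfo inputCol = new ColumnInfo("_col0", TypeInfoFactory.stringTypeInfo, "t1", false);

    // Key column: copied and renamed into the KEY.reducesinkkey<N> namespace, as in genReduceSink.
    ColumnInfo keyCol = new ColumnInfo(inputCol);
    keyCol.setInternalName(Utilities.ReduceField.KEY + ".reducesinkkey0");
    keyCol.setAlias("_col0");
    keyCol.setTabAlias("t1");

    ArrayList<ColumnInfo> outputColumns = new ArrayList<ColumnInfo>();
    outputColumns.add(keyCol);
    RowSchema rsSchema = new RowSchema(outputColumns);
    System.out.println(rsSchema);
  }
}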
use of org.apache.hadoop.hive.ql.exec.RowSchema in project hive by apache.
the class HiveOpConverter method visit.
/**
* TODO: 1. PPD needs to get pushed in to TS
*
* @param scanRel
* @return
*/
OpAttr visit(HiveTableScan scanRel) {
  if (LOG.isDebugEnabled()) {
    LOG.debug("Translating operator rel#" + scanRel.getId() + ":" + scanRel.getRelTypeName()
        + " with row type: [" + scanRel.getRowType() + "]");
  }
  RelOptHiveTable ht = (RelOptHiveTable) scanRel.getTable();
  // 1. Setup TableScan Desc
  // 1.1 Build col details used by scan
  ArrayList<ColumnInfo> colInfos = new ArrayList<ColumnInfo>();
  List<VirtualColumn> virtualCols = new ArrayList<VirtualColumn>();
  List<Integer> neededColumnIDs = new ArrayList<Integer>();
  List<String> neededColumnNames = new ArrayList<String>();
  Set<Integer> vcolsInCalcite = new HashSet<Integer>();
  List<String> partColNames = new ArrayList<String>();
  Map<Integer, VirtualColumn> VColsMap = HiveCalciteUtil.getVColsMap(ht.getVirtualCols(),
      ht.getNoOfNonVirtualCols());
  Map<Integer, ColumnInfo> posToPartColInfo = ht.getPartColInfoMap();
  Map<Integer, ColumnInfo> posToNonPartColInfo = ht.getNonPartColInfoMap();
  List<Integer> neededColIndxsFrmReloptHT = scanRel.getNeededColIndxsFrmReloptHT();
  List<String> scanColNames = scanRel.getRowType().getFieldNames();
  String tableAlias = scanRel.getConcatQbIDAlias();
  String colName;
  ColumnInfo colInfo;
  VirtualColumn vc;
  for (int index = 0; index < scanRel.getRowType().getFieldList().size(); index++) {
    colName = scanColNames.get(index);
    if (VColsMap.containsKey(index)) {
      vc = VColsMap.get(index);
      virtualCols.add(vc);
      colInfo = new ColumnInfo(vc.getName(), vc.getTypeInfo(), tableAlias, true, vc.getIsHidden());
      vcolsInCalcite.add(index);
    } else if (posToPartColInfo.containsKey(index)) {
      partColNames.add(colName);
      colInfo = posToPartColInfo.get(index);
      vcolsInCalcite.add(index);
    } else {
      colInfo = posToNonPartColInfo.get(index);
    }
    colInfos.add(colInfo);
    if (neededColIndxsFrmReloptHT.contains(index)) {
      neededColumnIDs.add(index);
      neededColumnNames.add(colName);
    }
  }
  // 1.2 Create TableScanDesc
  TableScanDesc tsd = new TableScanDesc(tableAlias, virtualCols, ht.getHiveTableMD());
  // 1.3. Set Partition cols in TSDesc
  tsd.setPartColumns(partColNames);
  // 1.4. Set needed cols in TSDesc
  tsd.setNeededColumnIDs(neededColumnIDs);
  tsd.setNeededColumns(neededColumnNames);
  // 2. Setup TableScan
  TableScanOperator ts = (TableScanOperator) OperatorFactory.get(semanticAnalyzer.getOpContext(), tsd,
      new RowSchema(colInfos));
  // If there is already a tablescan with the same alias, make the alias unique.
  if (topOps.get(tableAlias) != null) {
    tableAlias = tableAlias + this.uniqueCounter;
  }
  topOps.put(tableAlias, ts);
  if (LOG.isDebugEnabled()) {
    LOG.debug("Generated " + ts + " with row schema: [" + ts.getSchema() + "]");
  }
  return new OpAttr(tableAlias, vcolsInCalcite, ts);
}
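For the virtual-column branch above, the ColumnInfo constructor takes the column name, its type, the table alias, the virtual-column flag, and the hidden flag. Below is a minimal sketch using one of Hive's built-in virtual columns; the table alias "src" is an assumption made for illustration.

import java.util.ArrayList;

import org.apache.hadoop.hive.ql.exec.ColumnInfo;
import org.apache.hadoop.hive.ql.exec.RowSchema;
import org.apache.hadoop.hive.ql.metadata.VirtualColumn;

final class TableScanSchemaExample {
  public static void main(String[] args) {
    VirtualColumn vc = VirtualColumn.FILENAME;   // INPUT__FILE__NAME
    // Same constructor shape used for virtual columns in the visit(HiveTableScan) snippet above.
    ColumnInfo vcInfo = new ColumnInfo(vc.getName(), vc.getTypeInfo(), "src", true, vc.getIsHidden());

    ArrayList<ColumnInfo> colInfos = new ArrayList<ColumnInfo>();
    colInfos.add(vcInfo);
    // The TableScanOperator's schema is built from exactly such a ColumnInfo list.
    RowSchema scanSchema = new RowSchema(colInfos);
    System.out.println(scanSchema);
  }
}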