Use of org.apache.hadoop.hive.ql.plan.ReduceSinkDesc in project hive by apache.
From the class SemanticAnalyzer, method genJoinOperatorChildren:
private Operator genJoinOperatorChildren(QBJoinTree join, Operator left,
    Operator[] right, HashSet<Integer> omitOpts, ExprNodeDesc[][] joinKeys)
    throws SemanticException {
  RowResolver outputRR = new RowResolver();
  ArrayList<String> outputColumnNames = new ArrayList<String>();
  // all children are base classes
  Operator<?>[] rightOps = new Operator[right.length];
  int outputPos = 0;
  Map<String, Byte> reversedExprs = new HashMap<String, Byte>();
  HashMap<Byte, List<ExprNodeDesc>> exprMap = new HashMap<Byte, List<ExprNodeDesc>>();
  Map<String, ExprNodeDesc> colExprMap = new HashMap<String, ExprNodeDesc>();
  HashMap<Integer, Set<String>> posToAliasMap = new HashMap<Integer, Set<String>>();
  HashMap<Byte, List<ExprNodeDesc>> filterMap = new HashMap<Byte, List<ExprNodeDesc>>();
  // Only used for semijoin with residual predicates
  List<ColumnInfo> topSelectInputColumns = new ArrayList<>();
  for (int pos = 0; pos < right.length; ++pos) {
    Operator<?> input = right[pos] == null ? left : right[pos];
    ReduceSinkOperator rs = (ReduceSinkOperator) input;
    if (rs.getNumParent() != 1) {
      throw new SemanticException("RS should have single parent");
    }
    Operator<?> parent = rs.getParentOperators().get(0);
    ReduceSinkDesc rsDesc = (ReduceSinkDesc) (input.getConf());
    int[] index = rs.getValueIndex();
    ArrayList<ExprNodeDesc> valueDesc = new ArrayList<ExprNodeDesc>();
    ArrayList<ExprNodeDesc> filterDesc = new ArrayList<ExprNodeDesc>();
    Byte tag = (byte) rsDesc.getTag();
    // we will add a Select on top of the join
    if (omitOpts != null && omitOpts.contains(pos)
        && join.getPostJoinFilters().size() == 0) {
      exprMap.put(tag, valueDesc);
      filterMap.put(tag, filterDesc);
      rightOps[pos] = input;
      continue;
    }
    List<String> keyColNames = rsDesc.getOutputKeyColumnNames();
    List<String> valColNames = rsDesc.getOutputValueColumnNames();
    // prepare output descriptors for the input operator
    RowResolver inputRR = opParseCtx.get(input).getRowResolver();
    RowResolver parentRR = opParseCtx.get(parent).getRowResolver();
    posToAliasMap.put(pos, new HashSet<String>(inputRR.getTableNames()));
    List<ColumnInfo> columns = parentRR.getColumnInfos();
    for (int i = 0; i < index.length; i++) {
      ColumnInfo prev = columns.get(i);
      String[] nm = parentRR.reverseLookup(prev.getInternalName());
      String[] nm2 = parentRR.getAlternateMappings(prev.getInternalName());
      if (outputRR.get(nm[0], nm[1]) != null) {
        continue;
      }
      ColumnInfo info = new ColumnInfo(prev);
      String field;
      if (index[i] >= 0) {
        field = Utilities.ReduceField.KEY + "." + keyColNames.get(index[i]);
      } else {
        field = Utilities.ReduceField.VALUE + "." + valColNames.get(-index[i] - 1);
      }
      String internalName = getColumnInternalName(outputColumnNames.size());
      ExprNodeColumnDesc desc = new ExprNodeColumnDesc(info.getType(), field,
          info.getTabAlias(), info.getIsVirtualCol());
      info.setInternalName(internalName);
      colExprMap.put(internalName, desc);
      outputRR.put(nm[0], nm[1], info);
      if (nm2 != null) {
        outputRR.addMappingOnly(nm2[0], nm2[1], info);
      }
      valueDesc.add(desc);
      outputColumnNames.add(internalName);
      reversedExprs.put(internalName, tag);
      // Populate semijoin select if needed
      if (omitOpts == null || !omitOpts.contains(pos)) {
        topSelectInputColumns.add(info);
      }
    }
    for (ASTNode cond : join.getFilters().get(tag)) {
      filterDesc.add(genExprNodeDesc(cond, inputRR));
    }
    exprMap.put(tag, valueDesc);
    filterMap.put(tag, filterDesc);
    rightOps[pos] = input;
  }
  JoinCondDesc[] joinCondns = new JoinCondDesc[join.getJoinCond().length];
  for (int i = 0; i < join.getJoinCond().length; i++) {
    JoinCond condn = join.getJoinCond()[i];
    joinCondns[i] = new JoinCondDesc(condn);
  }
  JoinDesc desc = new JoinDesc(exprMap, outputColumnNames,
      join.getNoOuterJoin(), joinCondns, filterMap, joinKeys, null);
  desc.setReversedExprs(reversedExprs);
  desc.setFilterMap(join.getFilterMap());
  // Add filters that apply to more than one input
  if (join.getPostJoinFilters().size() != 0
      && (!join.getNoOuterJoin() || !join.getNoSemiJoin()
          || HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_PUSH_RESIDUAL_INNER))) {
    LOG.debug("Generate JOIN with post-filtering conditions");
    List<ExprNodeDesc> residualFilterExprs = new ArrayList<ExprNodeDesc>();
    for (ASTNode cond : join.getPostJoinFilters()) {
      residualFilterExprs.add(genExprNodeDesc(cond, outputRR, false, isCBOExecuted()));
    }
    desc.setResidualFilterExprs(residualFilterExprs);
    // Clean post-conditions
    join.getPostJoinFilters().clear();
  }
  JoinOperator joinOp = (JoinOperator) OperatorFactory.getAndMakeChild(getOpContext(),
      desc, new RowSchema(outputRR.getColumnInfos()), rightOps);
  joinOp.setColumnExprMap(colExprMap);
  joinOp.setPosToAliasMap(posToAliasMap);
  if (join.getNullSafes() != null) {
    boolean[] nullsafes = new boolean[join.getNullSafes().size()];
    for (int i = 0; i < nullsafes.length; i++) {
      nullsafes[i] = join.getNullSafes().get(i);
    }
    desc.setNullSafes(nullsafes);
  }
  Operator<?> topOp = putOpInsertMap(joinOp, outputRR);
  if (omitOpts != null && !omitOpts.isEmpty()
      && desc.getResidualFilterExprs() != null && !desc.getResidualFilterExprs().isEmpty()) {
    // Add a Select operator on top of the semijoin to project only the correct columns
    final List<ExprNodeDesc> topSelectExprs = new ArrayList<>();
    final List<String> topSelectOutputColNames = new ArrayList<>();
    final RowResolver topSelectRR = new RowResolver();
    final Map<String, ExprNodeDesc> topSelectColExprMap = new HashMap<String, ExprNodeDesc>();
    for (ColumnInfo colInfo : topSelectInputColumns) {
      ExprNodeColumnDesc columnExpr = new ExprNodeColumnDesc(colInfo);
      topSelectExprs.add(columnExpr);
      topSelectOutputColNames.add(colInfo.getInternalName());
      topSelectColExprMap.put(colInfo.getInternalName(), columnExpr);
      String[] nm = outputRR.reverseLookup(columnExpr.getColumn());
      String[] nm2 = outputRR.getAlternateMappings(columnExpr.getColumn());
      topSelectRR.put(nm[0], nm[1], colInfo);
      if (nm2 != null) {
        topSelectRR.addMappingOnly(nm2[0], nm2[1], colInfo);
      }
    }
    final SelectDesc topSelect = new SelectDesc(topSelectExprs, topSelectOutputColNames);
    topOp = putOpInsertMap(OperatorFactory.getAndMakeChild(topSelect,
        new RowSchema(topSelectRR.getColumnInfos()), topOp), topSelectRR);
    topOp.setColumnExprMap(topSelectColExprMap);
  }
  return topOp;
}
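The value-index bookkeeping above is the subtle part: each entry of rs.getValueIndex() is non-negative when the column travelled through the RS key (an index into keyColNames) and negative when it travelled through the RS value, encoded as -(valuePosition) - 1. A minimal standalone sketch of that decoding; the column name lists here are made up for illustration, not taken from a real plan:

import java.util.Arrays;
import java.util.List;

public class ValueIndexDemo {
  // Mirrors the decoding in genJoinOperatorChildren: index[i] >= 0 selects
  // KEY.<keyColNames[index[i]]>; index[i] < 0 selects VALUE.<valColNames[-index[i] - 1]>.
  static String decode(int idx, List<String> keyColNames, List<String> valColNames) {
    return idx >= 0
        ? "KEY." + keyColNames.get(idx)
        : "VALUE." + valColNames.get(-idx - 1);
  }

  public static void main(String[] args) {
    List<String> keys = Arrays.asList("reducesinkkey0", "reducesinkkey1"); // hypothetical names
    List<String> vals = Arrays.asList("_col0", "_col1");                   // hypothetical names
    for (int idx : new int[] {0, -1, 1, -2}) {
      System.out.println(idx + " -> " + decode(idx, keys, vals));
    }
    // 0 -> KEY.reducesinkkey0, -1 -> VALUE._col0,
    // 1 -> KEY.reducesinkkey1, -2 -> VALUE._col1
  }
}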
Use of org.apache.hadoop.hive.ql.plan.ReduceSinkDesc in project hive by apache.
From the class HiveOpConverterUtils, method genReduceSink:
@SuppressWarnings({ "rawtypes", "unchecked" })
static ReduceSinkOperator genReduceSink(Operator<?> input, String tableAlias, ExprNodeDesc[] keys,
    int tag, ArrayList<ExprNodeDesc> partitionCols, String order, String nullOrder,
    int numReducers, Operation acidOperation, HiveConf hiveConf) throws SemanticException {
  // dummy for backtracking
  Operator dummy = Operator.createDummy();
  dummy.setParentOperators(Arrays.asList(input));
  ArrayList<ExprNodeDesc> reduceKeys = new ArrayList<ExprNodeDesc>();
  ArrayList<ExprNodeDesc> reduceKeysBack = new ArrayList<ExprNodeDesc>();
  // Compute join keys and store in reduceKeys
  for (ExprNodeDesc key : keys) {
    reduceKeys.add(key);
    reduceKeysBack.add(ExprNodeDescUtils.backtrack(key, dummy, input));
  }
  // Walk over the input schema and copy in the output
  ArrayList<ExprNodeDesc> reduceValues = new ArrayList<ExprNodeDesc>();
  ArrayList<ExprNodeDesc> reduceValuesBack = new ArrayList<ExprNodeDesc>();
  Map<String, ExprNodeDesc> colExprMap = new HashMap<String, ExprNodeDesc>();
  List<ColumnInfo> inputColumns = input.getSchema().getSignature();
  ArrayList<ColumnInfo> outputColumns = new ArrayList<ColumnInfo>();
  List<String> outputColumnNames = new ArrayList<String>();
  int[] index = new int[inputColumns.size()];
  for (int i = 0; i < inputColumns.size(); i++) {
    ColumnInfo colInfo = inputColumns.get(i);
    String outputColName = colInfo.getInternalName();
    ExprNodeColumnDesc expr = new ExprNodeColumnDesc(colInfo);
    // backtrack can be null when input is script operator
    ExprNodeDesc exprBack = ExprNodeDescUtils.backtrack(expr, dummy, input);
    int kindex = exprBack == null ? -1 : ExprNodeDescUtils.indexOf(exprBack, reduceKeysBack);
    if (kindex >= 0) {
      ColumnInfo newColInfo = new ColumnInfo(colInfo);
      newColInfo.setInternalName(Utilities.ReduceField.KEY + ".reducesinkkey" + kindex);
      newColInfo.setAlias(outputColName);
      newColInfo.setTabAlias(tableAlias);
      outputColumns.add(newColInfo);
      index[i] = kindex;
      continue;
    }
    int vindex = exprBack == null ? -1 : ExprNodeDescUtils.indexOf(exprBack, reduceValuesBack);
    if (vindex >= 0) {
      index[i] = -vindex - 1;
      continue;
    }
    index[i] = -reduceValues.size() - 1;
    reduceValues.add(expr);
    reduceValuesBack.add(exprBack);
    ColumnInfo newColInfo = new ColumnInfo(colInfo);
    newColInfo.setInternalName(Utilities.ReduceField.VALUE + "." + outputColName);
    newColInfo.setAlias(outputColName);
    newColInfo.setTabAlias(tableAlias);
    outputColumns.add(newColInfo);
    outputColumnNames.add(outputColName);
  }
  dummy.setParentOperators(null);
  // Use only 1 reducer if no reduce keys
  if (reduceKeys.size() == 0) {
    numReducers = 1;
    // Cartesian product is not supported in strict mode
    String error = StrictChecks.checkCartesian(hiveConf);
    if (error != null) {
      throw new SemanticException(error);
    }
  }
  ReduceSinkDesc rsDesc;
  if (order.isEmpty()) {
    rsDesc = PlanUtils.getReduceSinkDesc(reduceKeys, reduceValues, outputColumnNames, false,
        tag, reduceKeys.size(), numReducers, acidOperation, NullOrdering.defaultNullOrder(hiveConf));
  } else {
    rsDesc = PlanUtils.getReduceSinkDesc(reduceKeys, reduceValues, outputColumnNames, false,
        tag, partitionCols, order, nullOrder, NullOrdering.defaultNullOrder(hiveConf),
        numReducers, acidOperation, false);
  }
  ReduceSinkOperator rsOp = (ReduceSinkOperator) OperatorFactory.getAndMakeChild(rsDesc,
      new RowSchema(outputColumns), input);
  List<String> keyColNames = rsDesc.getOutputKeyColumnNames();
  for (int i = 0; i < keyColNames.size(); i++) {
    colExprMap.put(Utilities.ReduceField.KEY + "." + keyColNames.get(i), reduceKeys.get(i));
  }
  List<String> valColNames = rsDesc.getOutputValueColumnNames();
  for (int i = 0; i < valColNames.size(); i++) {
    colExprMap.put(Utilities.ReduceField.VALUE + "." + valColNames.get(i), reduceValues.get(i));
  }
  rsOp.setValueIndex(index);
  rsOp.setColumnExprMap(colExprMap);
  rsOp.setInputAliases(input.getSchema().getTableNames()
      .toArray(new String[input.getSchema().getTableNames().size()]));
  if (LOG.isDebugEnabled()) {
    LOG.debug("Generated " + rsOp + " with row schema: [" + rsOp.getSchema() + "]");
  }
  return rsOp;
}
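The order and nullOrder parameters are per-key character strings. In Hive's convention, '+'/'-' mark ascending/descending sort per key and, for null ordering, 'a'/'z' mark nulls-first/nulls-last. A small sketch of assembling such strings, assuming those conventions; the helper below is illustrative and not part of the Hive API:

import java.util.Arrays;
import java.util.List;

public class OrderStrings {
  // Builds the per-key order/null-order strings in the assumed Hive convention:
  // '+' = ascending, '-' = descending; 'a' = nulls first, 'z' = nulls last.
  static String[] build(List<Boolean> ascending, List<Boolean> nullsFirst) {
    StringBuilder order = new StringBuilder();
    StringBuilder nullOrder = new StringBuilder();
    for (int i = 0; i < ascending.size(); i++) {
      order.append(ascending.get(i) ? '+' : '-');
      nullOrder.append(nullsFirst.get(i) ? 'a' : 'z');
    }
    return new String[] { order.toString(), nullOrder.toString() };
  }

  public static void main(String[] args) {
    String[] s = build(Arrays.asList(true, false), Arrays.asList(true, true));
    System.out.println(s[0] + " / " + s[1]); // prints: +- / aa
  }
}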
Use of org.apache.hadoop.hive.ql.plan.ReduceSinkDesc in project hive by apache.
From the class ReduceSinkDeDuplicationUtils, method merge:
// for the JOIN-RS case, it's generally not possible to merge if the child has
// fewer key/partition columns than the parents
public static boolean merge(ReduceSinkOperator cRS, JoinOperator pJoin, int minReducer)
    throws SemanticException {
  List<Operator<?>> parents = pJoin.getParentOperators();
  ReduceSinkOperator[] pRSs = parents.toArray(new ReduceSinkOperator[parents.size()]);
  ReduceSinkDesc cRSc = cRS.getConf();
  for (ReduceSinkOperator pRSNs : pRSs) {
    ReduceSinkDesc pRSNc = pRSNs.getConf();
    if (cRSc.getKeyCols().size() != pRSNc.getKeyCols().size()) {
      return false;
    }
    if (cRSc.getPartitionCols().size() != pRSNc.getPartitionCols().size()) {
      return false;
    }
    Integer moveReducerNumTo = checkNumReducer(cRSc.getNumReducers(), pRSNc.getNumReducers());
    if (moveReducerNumTo == null ||
        moveReducerNumTo > 0 && cRSc.getNumReducers() < minReducer) {
      return false;
    }
    Integer moveRSOrderTo = checkOrder(true, cRSc.getOrder(), pRSNc.getOrder(),
        cRSc.getNullOrder(), pRSNc.getNullOrder());
    if (moveRSOrderTo == null) {
      return false;
    }
  }
  boolean[] sorted = CorrelationUtilities.getSortedTags(pJoin);
  int cKeySize = cRSc.getKeyCols().size();
  for (int i = 0; i < cKeySize; i++) {
    ExprNodeDesc cexpr = cRSc.getKeyCols().get(i);
    ExprNodeDesc[] pexprs = new ExprNodeDesc[pRSs.length];
    for (int tag = 0; tag < pRSs.length; tag++) {
      pexprs[tag] = pRSs[tag].getConf().getKeyCols().get(i);
    }
    int found = CorrelationUtilities.indexOf(cexpr, pexprs, cRS, pRSs, sorted);
    if (found != i) {
      return false;
    }
  }
  int cPartSize = cRSc.getPartitionCols().size();
  for (int i = 0; i < cPartSize; i++) {
    ExprNodeDesc cexpr = cRSc.getPartitionCols().get(i);
    ExprNodeDesc[] pexprs = new ExprNodeDesc[pRSs.length];
    for (int tag = 0; tag < pRSs.length; tag++) {
      pexprs[tag] = pRSs[tag].getConf().getPartitionCols().get(i);
    }
    int found = CorrelationUtilities.indexOf(cexpr, pexprs, cRS, pRSs, sorted);
    if (found != i) {
      return false;
    }
  }
  for (ReduceSinkOperator pRS : pRSs) {
    pRS.getConf().setNumReducers(cRS.getConf().getNumReducers());
  }
  return true;
}
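In other words, merge() succeeds only when the child RS agrees with every parent RS of the join: same key and partition column counts, compatible reducer counts and sort orders, and each child expression resolving to the same position in every parent. A simplified, self-contained illustration of the positional check, using plain strings as stand-ins for ExprNodeDesc (CorrelationUtilities.indexOf does the real, expression-aware lookup):

import java.util.Arrays;
import java.util.List;

public class DedupCheckDemo {
  // Returns true only if, for every position i, the child key is found at the
  // same position i in each parent's key list - the shape of the loops in merge().
  static boolean positionsMatch(List<String> childKeys, List<List<String>> parentKeys) {
    for (List<String> pKeys : parentKeys) {
      if (pKeys.size() != childKeys.size()) {
        return false; // mirrors the size checks before the positional loop
      }
    }
    for (int i = 0; i < childKeys.size(); i++) {
      for (List<String> pKeys : parentKeys) {
        if (pKeys.indexOf(childKeys.get(i)) != i) {
          return false;
        }
      }
    }
    return true;
  }

  public static void main(String[] args) {
    List<String> child = Arrays.asList("key", "val");
    List<List<String>> aligned = Arrays.asList(
        Arrays.asList("key", "val"), Arrays.asList("key", "val"));
    List<List<String>> reordered = Arrays.asList(
        Arrays.asList("key", "val"), Arrays.asList("val", "key"));
    System.out.println(positionsMatch(child, aligned));   // true
    System.out.println(positionsMatch(child, reordered)); // false: second parent reordered
  }
}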
Use of org.apache.hadoop.hive.ql.plan.ReduceSinkDesc in project hive by apache.
From the class TestExecDriver, method populateMapRedPlan6:
@SuppressWarnings("unchecked")
private void populateMapRedPlan6(Table src) throws Exception {
  // map-side work
  ArrayList<String> outputColumns = new ArrayList<String>();
  for (int i = 0; i < 2; i++) {
    outputColumns.add("_col" + i);
  }
  Operator<ReduceSinkDesc> op1 = OperatorFactory.get(ctx, PlanUtils.getReduceSinkDesc(
      Utilities.makeList(getStringColumn("tkey")),
      Utilities.makeList(getStringColumn("tkey"), getStringColumn("tvalue")),
      outputColumns, false, -1, 1, -1, AcidUtils.Operation.NOT_ACID, NullOrdering.NULLS_LAST));
  Operator<ScriptDesc> op0 = OperatorFactory.get(new ScriptDesc("\'cat\'",
      PlanUtils.getDefaultTableDesc("" + Utilities.tabCode, "tkey,tvalue"),
      TextRecordWriter.class,
      PlanUtils.getDefaultTableDesc("" + Utilities.tabCode, "tkey,tvalue"),
      TextRecordReader.class, TextRecordReader.class,
      PlanUtils.getDefaultTableDesc("" + Utilities.tabCode, "key")), op1);
  Operator<SelectDesc> op4 = OperatorFactory.get(new SelectDesc(
      Utilities.makeList(getStringColumn("key"), getStringColumn("value")),
      outputColumns), op0);
  addMapWork(mr, src, "a", op4);
  ReduceWork rWork = new ReduceWork();
  mr.setReduceWork(rWork);
  rWork.setNumReduceTasks(Integer.valueOf(1));
  rWork.setKeyDesc(op1.getConf().getKeySerializeInfo());
  rWork.getTagToValueDesc().add(op1.getConf().getValueSerializeInfo());
  // reduce side work
  Operator<FileSinkDesc> op3 = OperatorFactory.get(ctx, new FileSinkDesc(
      new Path(TMPDIR + File.separator + "mapredplan6.out"), Utilities.defaultTd, false));
  Operator<FilterDesc> op2 = OperatorFactory.get(getTestFilterDesc("0"), op3);
  List<ExprNodeDesc> cols = new ArrayList<ExprNodeDesc>();
  cols.add(getStringColumn(Utilities.ReduceField.KEY + ".reducesinkkey" + 0));
  cols.add(getStringColumn(Utilities.ReduceField.VALUE.toString() + "." + outputColumns.get(1)));
  Operator<SelectDesc> op5 = OperatorFactory.get(new SelectDesc(cols, outputColumns), op2);
  rWork.setReducer(op5);
}
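On the reduce side this plan selects its columns by the internal names the ReduceSinkDesc assigns: the single sort key surfaces as KEY.reducesinkkey0 and the second value column as VALUE._col1. A tiny standalone check of that string construction, with a stand-in enum in place of Utilities.ReduceField:

public class ReduceFieldNames {
  // Stand-in for org.apache.hadoop.hive.ql.exec.Utilities.ReduceField.
  enum ReduceField { KEY, VALUE }

  public static void main(String[] args) {
    String keyRef = ReduceField.KEY + ".reducesinkkey" + 0; // "KEY.reducesinkkey0"
    String valRef = ReduceField.VALUE + "." + "_col1";      // "VALUE._col1"
    System.out.println(keyRef + ", " + valRef);
  }
}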
Use of org.apache.hadoop.hive.ql.plan.ReduceSinkDesc in project hive by apache.
From the class TestExecDriver, method populateMapRedPlan3:
/**
 * Tests reduce with multiple tagged inputs.
 */
@SuppressWarnings("unchecked")
private void populateMapRedPlan3(Table src, Table src2) throws SemanticException {
  List<String> outputColumns = new ArrayList<String>();
  for (int i = 0; i < 2; i++) {
    outputColumns.add("_col" + i);
  }
  // map-side work
  Operator<ReduceSinkDesc> op1 = OperatorFactory.get(ctx, PlanUtils.getReduceSinkDesc(
      Utilities.makeList(getStringColumn("key")),
      Utilities.makeList(getStringColumn("value")),
      outputColumns, true, Byte.valueOf((byte) 0), 1, -1,
      AcidUtils.Operation.NOT_ACID, NullOrdering.NULLS_LAST));
  addMapWork(mr, src, "a", op1);
  Operator<ReduceSinkDesc> op2 = OperatorFactory.get(ctx, PlanUtils.getReduceSinkDesc(
      Utilities.makeList(getStringColumn("key")),
      Utilities.makeList(getStringColumn("key")),
      outputColumns, true, Byte.valueOf((byte) 1), Integer.MAX_VALUE, -1,
      AcidUtils.Operation.NOT_ACID, NullOrdering.NULLS_LAST));
  addMapWork(mr, src2, "b", op2);
  ReduceWork rWork = new ReduceWork();
  rWork.setNumReduceTasks(Integer.valueOf(1));
  rWork.setNeedsTagging(true);
  rWork.setKeyDesc(op1.getConf().getKeySerializeInfo());
  rWork.getTagToValueDesc().add(op1.getConf().getValueSerializeInfo());
  mr.setReduceWork(rWork);
  rWork.getTagToValueDesc().add(op2.getConf().getValueSerializeInfo());
  // reduce side work
  Operator<FileSinkDesc> op4 = OperatorFactory.get(ctx, new FileSinkDesc(
      new Path(TMPDIR + File.separator + "mapredplan3.out"), Utilities.defaultTd, false));
  Operator<SelectDesc> op5 = OperatorFactory.get(new SelectDesc(
      Utilities.makeList(new ExprNodeFieldDesc(TypeInfoFactory.stringTypeInfo,
          new ExprNodeColumnDesc(TypeInfoFactory.getListTypeInfo(TypeInfoFactory.stringTypeInfo),
              Utilities.ReduceField.VALUE.toString(), "", false),
          "0", false)),
      Utilities.makeList(outputColumns.get(0))), op4);
  rWork.setReducer(op5);
}
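Since rWork.setNeedsTagging(true) is set, each reduce-side row carries the tag of the ReduceSinkOperator that produced it (0 for table a, 1 for b), and getTagToValueDesc() is indexed by that tag so the right value deserializer is chosen per row. A minimal illustration of that tag-indexed dispatch with plain lists; this is a sketch of the idea, not Hive's actual ExecReducer code:

import java.util.Arrays;
import java.util.List;

public class TagDispatchDemo {
  public static void main(String[] args) {
    // One value-deserializer description per producer, indexed by RS tag.
    List<String> tagToValueDesc = Arrays.asList("valueDescForA", "valueDescForB");
    int[] incomingTags = {0, 1, 1, 0};
    for (int tag : incomingTags) {
      // ExecReducer-style lookup: pick the descriptor matching the row's tag.
      System.out.println("tag " + tag + " -> " + tagToValueDesc.get(tag));
    }
  }
}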