use of org.apache.hadoop.hive.ql.plan.ReduceSinkDesc in project hive by apache.
the class ReduceSinkDeDuplicationUtils method extractMergeDirections.
/**
* Returns merge directions between two RSs for each criterion (ordering, number of reducers,
* reducer keys, partition keys). Returns null if any of the categories is not mergeable.
*
* Values for each index can be -1, 0, 1
* 1. 0 means the two configurations in the category are the same
* 2. -1 means the configuration of the parent RS is more specific than that of the child RS
* 3. 1 means the configuration of the child RS is more specific than that of the parent RS
*/
private static int[] extractMergeDirections(HiveConf hiveConf, ReduceSinkOperator cRS, ReduceSinkOperator pRS, int minReducer) throws SemanticException {
ReduceSinkDesc cConf = cRS.getConf();
ReduceSinkDesc pConf = pRS.getConf();
// If there is a PTF between cRS and pRS we cannot ignore the order direction
final boolean checkStrictEquality = isStrictEqualityNeeded(cRS, pRS);
Integer moveRSOrderTo = checkOrder(checkStrictEquality, cConf.getOrder(), pConf.getOrder(), cConf.getNullOrder(), pConf.getNullOrder());
if (moveRSOrderTo == null) {
return null;
}
// if cRS is being used for distinct - the two reduce sinks are incompatible
if (cConf.getDistinctColumnIndices().size() >= 2) {
return null;
}
if (cConf.getBucketingVersion() != pConf.getBucketingVersion()) {
return null;
}
Integer moveReducerNumTo = checkNumReducer(cConf.getNumReducers(), pConf.getNumReducers());
if (moveReducerNumTo == null || moveReducerNumTo > 0 && cConf.getNumReducers() < minReducer) {
return null;
}
List<ExprNodeDesc> ckeys = cConf.getKeyCols();
List<ExprNodeDesc> pkeys = pConf.getKeyCols();
Integer moveKeyColTo = checkExprs(ckeys, pkeys, cRS, pRS);
if (moveKeyColTo == null) {
return null;
}
List<ExprNodeDesc> cpars = cConf.getPartitionCols();
List<ExprNodeDesc> ppars = pConf.getPartitionCols();
Integer movePartitionColTo = checkExprs(cpars, ppars, cRS, pRS);
if (movePartitionColTo == null) {
return null;
}
if (canReplaceParentWithChildPartioning(movePartitionColTo, cpars, ppars)) {
long oldParallelism = estimateReducers(hiveConf, pRS);
long newParallelism = estimateReducers(hiveConf, cRS);
if (newParallelism < oldParallelism && newParallelism < minReducer) {
return null;
}
}
Integer moveNumDistKeyTo = checkNumDistributionKey(cConf.getNumDistributionKeys(), pConf.getNumDistributionKeys());
return new int[] { moveKeyColTo, movePartitionColTo, moveRSOrderTo, moveReducerNumTo, moveNumDistKeyTo };
}
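The returned array encodes, per category (key columns, partition columns, sort order, reducer count, number of distribution keys), which ReduceSink carries the more specific configuration. Below is a minimal, self-contained sketch of how one entry of such a direction array could be consumed when merging a single category; MergeDirectionSketch and pick are hypothetical names for illustration, not Hive API.

// Hypothetical, self-contained sketch: interpreting a merge-direction entry.
// Meaning follows the Javadoc above: 0 = identical, -1 = parent RS is more
// specific, 1 = child RS is more specific. None of these names are Hive APIs.
public class MergeDirectionSketch {
  /** Picks the value that should survive the merge for one category. */
  static <T> T pick(int direction, T childValue, T parentValue) {
    // direction > 0: the child RS carries the more specific setting, keep it;
    // otherwise the parent's setting is equal or more specific, keep that one.
    return direction > 0 ? childValue : parentValue;
  }

  public static void main(String[] args) {
    // Example directions for {keyCols, partitionCols, order, numReducers, numDistKeys}
    int[] directions = {1, 0, 0, 1, 0};
    int childReducers = 32, parentReducers = -1; // -1 conventionally means "not set"
    System.out.println("reducers after merge: "
        + pick(directions[3], childReducers, parentReducers));
  }
}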
use of org.apache.hadoop.hive.ql.plan.ReduceSinkDesc in project hive by apache.
the class ReduceSinkDeDuplicationUtils method strictMerge.
public static boolean strictMerge(ReduceSinkOperator cRS, List<ReduceSinkOperator> pRSs) throws SemanticException {
ReduceSinkDesc cRSc = cRS.getConf();
for (ReduceSinkOperator pRS : pRSs) {
ReduceSinkDesc pRSc = pRS.getConf();
if (cRSc.getKeyCols().size() != pRSc.getKeyCols().size()) {
return false;
}
if (cRSc.getPartitionCols().size() != pRSc.getPartitionCols().size()) {
return false;
}
Integer moveRSOrderTo = checkOrder(true, cRSc.getOrder(), pRSc.getOrder(), cRSc.getNullOrder(), pRSc.getNullOrder());
if (moveRSOrderTo == null) {
return false;
}
int cKeySize = cRSc.getKeyCols().size();
for (int i = 0; i < cKeySize; i++) {
ExprNodeDesc cExpr = cRSc.getKeyCols().get(i);
ExprNodeDesc pExpr = pRSc.getKeyCols().get(i);
if (cExpr instanceof ExprNodeConstantDesc || pExpr instanceof ExprNodeConstantDesc) {
// If the child or parent key columns contain constant expressions, avoid the merge.
return false;
}
ExprNodeDesc backtrackCExpr = ExprNodeDescUtils.backtrack(cExpr, cRS, pRS);
if (backtrackCExpr == null || !pExpr.isSame(backtrackCExpr)) {
return false;
}
}
int cPartSize = cRSc.getPartitionCols().size();
for (int i = 0; i < cPartSize; i++) {
ExprNodeDesc cExpr = cRSc.getPartitionCols().get(i);
ExprNodeDesc pExpr = pRSc.getPartitionCols().get(i);
if (cExpr instanceof ExprNodeConstantDesc || pExpr instanceof ExprNodeConstantDesc) {
// If the child or parent partition columns contain constant expressions, avoid the merge.
return false;
}
ExprNodeDesc backtrackCExpr = ExprNodeDescUtils.backtrack(cExpr, cRS, pRS);
if (backtrackCExpr == null || !pExpr.isSame(backtrackCExpr)) {
return false;
}
}
if (cRSc.getBucketCols() != null || pRSc.getBucketCols() != null) {
if (cRSc.getBucketCols() == null || pRSc.getBucketCols() == null) {
return false;
}
if (cRSc.getBucketCols().size() != pRSc.getBucketCols().size()) {
return false;
}
int cBucketColsSize = cRSc.getBucketCols().size();
for (int i = 0; i < cBucketColsSize; i++) {
ExprNodeDesc cExpr = cRSc.getBucketCols().get(i);
ExprNodeDesc pExpr = pRSc.getBucketCols().get(i);
if (cExpr instanceof ExprNodeConstantDesc || pExpr instanceof ExprNodeConstantDesc) {
// If the child or parent bucket columns contain constant expressions, avoid the merge.
return false;
}
ExprNodeDesc backtrackCExpr = ExprNodeDescUtils.backtrack(cExpr, cRS, pRS);
if (backtrackCExpr == null || !pExpr.isSame(backtrackCExpr)) {
return false;
}
}
}
// Meets all requirements
return true;
}
// Default
return false;
}
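Each of the three loops above follows the same pattern: backtrack the child RS expression to the parent RS and require positional equality, bailing out on constants. A simplified, self-contained sketch of that pattern follows; Expr, sameColumns, and backtrackToParent are stand-ins for Hive's ExprNodeDesc, the loops above, and ExprNodeDescUtils.backtrack, not the actual API.

// Minimal sketch of the backtrack-and-compare loop used three times above
// (key, partition, and bucket columns). The Expr type and backtrackToParent
// function are simplified stand-ins, not Hive's ExprNodeDesc/ExprNodeDescUtils.
import java.util.List;
import java.util.function.UnaryOperator;

final class ColumnListMatcher {
  interface Expr { boolean isConstant(); boolean isSame(Expr other); }

  /** True only if every child expression, rewritten in terms of the parent's
   *  output, matches the parent expression at the same index. */
  static boolean sameColumns(List<Expr> childExprs, List<Expr> parentExprs,
                             UnaryOperator<Expr> backtrackToParent) {
    if (childExprs.size() != parentExprs.size()) {
      return false;
    }
    for (int i = 0; i < childExprs.size(); i++) {
      Expr c = childExprs.get(i);
      Expr p = parentExprs.get(i);
      if (c.isConstant() || p.isConstant()) {
        return false;              // constants make positional equality unreliable
      }
      Expr rewritten = backtrackToParent.apply(c);
      if (rewritten == null || !p.isSame(rewritten)) {
        return false;              // column cannot be traced back, or differs
      }
    }
    return true;
  }
}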
use of org.apache.hadoop.hive.ql.plan.ReduceSinkDesc in project hive by apache.
the class ReduceSinkDeDuplicationUtils method aggressiveDedup.
protected static boolean aggressiveDedup(ReduceSinkOperator cRS, ReduceSinkOperator pRS, ReduceSinkDeduplicateProcCtx dedupCtx) throws SemanticException {
assert cRS.getNumParent() == 1;
ReduceSinkDesc cConf = cRS.getConf();
ReduceSinkDesc pConf = pRS.getConf();
List<ExprNodeDesc> cKeys = cConf.getKeyCols();
List<ExprNodeDesc> pKeys = pConf.getKeyCols();
if (!checkSelectSingleBranchOnly(cRS, pRS)) {
return false;
}
// If child keys are null or empty, we bail out
if (cKeys == null || cKeys.isEmpty()) {
return false;
}
// If parent keys are null or empty, we bail out
if (pKeys == null || pKeys.isEmpty()) {
return false;
}
// Backtrack key columns of cRS to pRS
// If we cannot backtrack any of the columns, bail out
List<ExprNodeDesc> cKeysInParentRS = ExprNodeDescUtils.backtrack(cKeys, cRS, pRS);
for (int i = 0; i < cKeysInParentRS.size(); i++) {
ExprNodeDesc pexpr = cKeysInParentRS.get(i);
if (pexpr == null) {
// We cannot backtrack the expression, we bail out
return false;
}
}
cRS.getConf().setKeyCols(cKeysInParentRS);
// Backtrack partition columns of cRS to pRS
// If we cannot backtrack any of the columns, bail out
List<ExprNodeDesc> cPartitionInParentRS = ExprNodeDescUtils.backtrack(cConf.getPartitionCols(), cRS, pRS);
for (int i = 0; i < cPartitionInParentRS.size(); i++) {
ExprNodeDesc pexpr = cPartitionInParentRS.get(i);
if (pexpr == null) {
// We cannot backtrack the expression, we bail out
return false;
}
}
cRS.getConf().setPartitionCols(cPartitionInParentRS);
// Backtrack value columns of cRS to pRS
// If we cannot backtrack any of the columns, bail out
List<ExprNodeDesc> cValueInParentRS = ExprNodeDescUtils.backtrack(cConf.getValueCols(), cRS, pRS);
for (int i = 0; i < cValueInParentRS.size(); i++) {
ExprNodeDesc pexpr = cValueInParentRS.get(i);
if (pexpr == null) {
// We cannot backtrack the expression, we bail out
return false;
}
}
cRS.getConf().setValueCols(cValueInParentRS);
// If we cannot backtrack any of the columns, bail out
if (cConf.getBucketCols() != null) {
List<ExprNodeDesc> cBucketInParentRS = ExprNodeDescUtils.backtrack(cConf.getBucketCols(), cRS, pRS);
for (int i = 0; i < cBucketInParentRS.size(); i++) {
ExprNodeDesc pexpr = cBucketInParentRS.get(i);
if (pexpr == null) {
// We cannot backtrack the expression, we bail out
return false;
}
}
cRS.getConf().setBucketCols(cBucketInParentRS);
}
// Update column expression map
for (Entry<String, ExprNodeDesc> e : cRS.getColumnExprMap().entrySet()) {
e.setValue(ExprNodeDescUtils.backtrack(e.getValue(), cRS, pRS));
}
// Replace pRS with cRS and remove operator sequence from pRS to cRS
// Recall that the sequence must be pRS-SEL*-cRS
Operator<? extends OperatorDesc> parent = cRS.getParentOperators().get(0);
while (parent != pRS) {
dedupCtx.addRemovedOperator(parent);
parent = parent.getParentOperators().get(0);
}
dedupCtx.addRemovedOperator(pRS);
cRS.getParentOperators().clear();
for (Operator<? extends OperatorDesc> op : pRS.getParentOperators()) {
op.replaceChild(pRS, cRS);
cRS.getParentOperators().add(op);
}
pRS.getParentOperators().clear();
pRS.getChildOperators().clear();
return true;
}
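Once the column lists have been rewritten in terms of pRS's input, the final block splices cRS into pRS's slot in the operator DAG and collects the intermediate SEL operators (and pRS itself) for removal. The toy sketch below, which assumes a simplified Op type instead of Hive's Operator, illustrates just that rewiring step.

// Toy sketch of the final rewiring step: cRS takes over pRS's position in the
// operator DAG, and the pRS..cRS chain in between is collected for removal.
// Op and splice are hypothetical stand-ins, not Hive's Operator API.
import java.util.ArrayList;
import java.util.List;

final class Op {
  final String name;
  final List<Op> parents = new ArrayList<>();
  final List<Op> children = new ArrayList<>();
  Op(String name) { this.name = name; }

  void replaceChild(Op oldChild, Op newChild) {
    children.set(children.indexOf(oldChild), newChild);
  }
}

final class SpliceSketch {
  /** Collects the pRS..cRS chain for removal and connects pRS's parents directly to cRS. */
  static List<Op> splice(Op cRS, Op pRS) {
    List<Op> removed = new ArrayList<>();
    Op cursor = cRS.parents.get(0);
    while (cursor != pRS) {            // collect the SEL* operators in between
      removed.add(cursor);
      cursor = cursor.parents.get(0);
    }
    removed.add(pRS);
    cRS.parents.clear();
    for (Op grandParent : pRS.parents) {
      grandParent.replaceChild(pRS, cRS);
      cRS.parents.add(grandParent);
    }
    pRS.parents.clear();
    pRS.children.clear();
    return removed;
  }
}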
use of org.apache.hadoop.hive.ql.plan.ReduceSinkDesc in project hive by apache.
the class SemanticAnalyzer method genReduceSinkPlan.
@SuppressWarnings("nls")
private Operator genReduceSinkPlan(Operator<?> input, List<ExprNodeDesc> partitionCols, List<ExprNodeDesc> sortCols, String sortOrder, String nullOrder, int numReducers, AcidUtils.Operation acidOp, boolean pullConstants, boolean isCompaction) throws SemanticException {
RowResolver inputRR = opParseCtx.get(input).getRowResolver();
Operator dummy = Operator.createDummy();
dummy.setParentOperators(Arrays.asList(input));
List<ExprNodeDesc> newSortCols = new ArrayList<ExprNodeDesc>();
StringBuilder newSortOrder = new StringBuilder();
StringBuilder newNullOrder = new StringBuilder();
List<ExprNodeDesc> sortColsBack = new ArrayList<ExprNodeDesc>();
for (int i = 0; i < sortCols.size(); i++) {
ExprNodeDesc sortCol = sortCols.get(i);
// we are pulling constants but this is not a constant
if (!pullConstants || !(sortCol instanceof ExprNodeConstantDesc)) {
newSortCols.add(sortCol);
newSortOrder.append(sortOrder.charAt(i));
newNullOrder.append(nullOrder.charAt(i));
sortColsBack.add(ExprNodeDescUtils.backtrack(sortCol, dummy, input));
}
}
// For the generation of the value expressions, just take the input's
// signature and generate field expressions for those columns
RowResolver rsRR = new RowResolver();
List<String> outputColumns = new ArrayList<String>();
List<ExprNodeDesc> valueCols = new ArrayList<ExprNodeDesc>();
List<ExprNodeDesc> valueColsBack = new ArrayList<ExprNodeDesc>();
Map<String, ExprNodeDesc> colExprMap = new HashMap<String, ExprNodeDesc>();
List<ExprNodeDesc> constantCols = new ArrayList<ExprNodeDesc>();
List<ColumnInfo> columnInfos = inputRR.getColumnInfos();
int[] index = new int[columnInfos.size()];
for (int i = 0; i < index.length; i++) {
ColumnInfo colInfo = columnInfos.get(i);
String[] nm = inputRR.reverseLookup(colInfo.getInternalName());
String[] nm2 = inputRR.getAlternateMappings(colInfo.getInternalName());
ExprNodeColumnDesc value = new ExprNodeColumnDesc(colInfo);
// backtrack can be null when the input is a script operator
ExprNodeDesc valueBack = ExprNodeDescUtils.backtrack(value, dummy, input);
if (pullConstants && valueBack instanceof ExprNodeConstantDesc) {
// ignore, it will be generated by SEL op
index[i] = Integer.MAX_VALUE;
constantCols.add(valueBack);
continue;
}
int kindex = valueBack == null ? -1 : ExprNodeDescUtils.indexOf(valueBack, sortColsBack);
if (kindex >= 0) {
index[i] = kindex;
ColumnInfo newColInfo = new ColumnInfo(colInfo);
newColInfo.setInternalName(Utilities.ReduceField.KEY + ".reducesinkkey" + kindex);
newColInfo.setTabAlias(nm[0]);
rsRR.put(nm[0], nm[1], newColInfo);
if (nm2 != null) {
rsRR.addMappingOnly(nm2[0], nm2[1], newColInfo);
}
continue;
}
int vindex = valueBack == null ? -1 : ExprNodeDescUtils.indexOf(valueBack, valueColsBack);
if (vindex >= 0) {
index[i] = -vindex - 1;
continue;
}
index[i] = -valueCols.size() - 1;
String outputColName = getColumnInternalName(valueCols.size());
valueCols.add(value);
valueColsBack.add(valueBack);
ColumnInfo newColInfo = new ColumnInfo(colInfo);
newColInfo.setInternalName(Utilities.ReduceField.VALUE + "." + outputColName);
newColInfo.setTabAlias(nm[0]);
rsRR.put(nm[0], nm[1], newColInfo);
if (nm2 != null) {
rsRR.addMappingOnly(nm2[0], nm2[1], newColInfo);
}
outputColumns.add(outputColName);
}
dummy.setParentOperators(null);
ReduceSinkDesc rsdesc = PlanUtils.getReduceSinkDesc(newSortCols, valueCols, outputColumns, false, -1, partitionCols, newSortOrder.toString(), newNullOrder.toString(), defaultNullOrder, numReducers, acidOp, isCompaction);
Operator interim = putOpInsertMap(OperatorFactory.getAndMakeChild(rsdesc, new RowSchema(rsRR.getColumnInfos()), input), rsRR);
List<String> keyColNames = rsdesc.getOutputKeyColumnNames();
for (int i = 0; i < keyColNames.size(); i++) {
colExprMap.put(Utilities.ReduceField.KEY + "." + keyColNames.get(i), newSortCols.get(i));
}
List<String> valueColNames = rsdesc.getOutputValueColumnNames();
for (int i = 0; i < valueColNames.size(); i++) {
colExprMap.put(Utilities.ReduceField.VALUE + "." + valueColNames.get(i), valueCols.get(i));
}
interim.setColumnExprMap(colExprMap);
RowResolver selectRR = new RowResolver();
List<ExprNodeDesc> selCols = new ArrayList<ExprNodeDesc>();
List<String> selOutputCols = new ArrayList<String>();
Map<String, ExprNodeDesc> selColExprMap = new HashMap<String, ExprNodeDesc>();
Iterator<ExprNodeDesc> constants = constantCols.iterator();
for (int i = 0; i < index.length; i++) {
ColumnInfo prev = columnInfos.get(i);
String[] nm = inputRR.reverseLookup(prev.getInternalName());
String[] nm2 = inputRR.getAlternateMappings(prev.getInternalName());
ColumnInfo info = new ColumnInfo(prev);
ExprNodeDesc desc;
if (index[i] == Integer.MAX_VALUE) {
desc = constants.next();
} else {
String field;
if (index[i] >= 0) {
field = Utilities.ReduceField.KEY + "." + keyColNames.get(index[i]);
} else {
field = Utilities.ReduceField.VALUE + "." + valueColNames.get(-index[i] - 1);
}
desc = new ExprNodeColumnDesc(info.getType(), field, info.getTabAlias(), info.getIsVirtualCol());
}
selCols.add(desc);
String internalName = getColumnInternalName(i);
info.setInternalName(internalName);
selectRR.put(nm[0], nm[1], info);
if (nm2 != null) {
selectRR.addMappingOnly(nm2[0], nm2[1], info);
}
selOutputCols.add(internalName);
selColExprMap.put(internalName, desc);
}
SelectDesc select = new SelectDesc(selCols, selOutputCols);
Operator output = putOpInsertMap(OperatorFactory.getAndMakeChild(select, new RowSchema(selectRR.getColumnInfos()), interim), selectRR);
output.setColumnExprMap(selColExprMap);
return output;
}
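The index array built above uses a compact encoding: a non-negative entry is a position in the reduce key, a negative entry v denotes value column (-v - 1), and Integer.MAX_VALUE marks a constant that the trailing SELECT regenerates. Below is a small sketch of how that encoding maps to the KEY./VALUE. internal names, assuming the naming scheme visible in the snippet; decode is a hypothetical helper, not part of SemanticAnalyzer.

// Sketch of the index[] encoding used above: non-negative = position in the
// reduce key, negative = (-index - 1) position in the reduce value, and
// Integer.MAX_VALUE = constant re-emitted by the SELECT after the RS.
// decode() is hypothetical; the name patterns follow the snippet above.
final class ReduceFieldIndexSketch {
  static String decode(int index) {
    if (index == Integer.MAX_VALUE) {
      return "constant (regenerated by the SELECT after the RS)";
    }
    if (index >= 0) {
      return "KEY.reducesinkkey" + index;       // sort key column
    }
    return "VALUE._col" + (-index - 1);         // pass-through value column
  }

  public static void main(String[] args) {
    int[] example = {0, Integer.MAX_VALUE, -1, 1, -2};
    for (int idx : example) {
      System.out.println(idx + " -> " + decode(idx));
    }
  }
}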
use of org.apache.hadoop.hive.ql.plan.ReduceSinkDesc in project hive by apache.
the class SemanticAnalyzer method genMaterializedViewDataOrgPlan.
private Operator genMaterializedViewDataOrgPlan(List<ColumnInfo> sortColInfos, List<ColumnInfo> distributeColInfos, RowResolver inputRR, Operator input) {
// In this case, we introduce an RS and, immediately after it, a SEL that restores
// the row schema to what the follow-up operations expect
Set<String> keys = sortColInfos.stream().map(ColumnInfo::getInternalName).collect(Collectors.toSet());
Set<String> distributeKeys = distributeColInfos.stream().map(ColumnInfo::getInternalName).collect(Collectors.toSet());
List<ExprNodeDesc> keyCols = new ArrayList<>();
List<String> keyColNames = new ArrayList<>();
StringBuilder order = new StringBuilder();
StringBuilder nullOrder = new StringBuilder();
List<ExprNodeDesc> valCols = new ArrayList<>();
List<String> valColNames = new ArrayList<>();
List<ExprNodeDesc> partCols = new ArrayList<>();
Map<String, ExprNodeDesc> colExprMap = new HashMap<>();
Map<String, String> nameMapping = new HashMap<>();
// map _col0 to KEY._col0, etc
for (ColumnInfo ci : inputRR.getRowSchema().getSignature()) {
ExprNodeColumnDesc e = new ExprNodeColumnDesc(ci);
String columnName = ci.getInternalName();
if (keys.contains(columnName)) {
// key (sort column)
keyColNames.add(columnName);
keyCols.add(e);
colExprMap.put(Utilities.ReduceField.KEY + "." + columnName, e);
nameMapping.put(columnName, Utilities.ReduceField.KEY + "." + columnName);
order.append("+");
nullOrder.append("a");
} else {
// value
valColNames.add(columnName);
valCols.add(e);
colExprMap.put(Utilities.ReduceField.VALUE + "." + columnName, e);
nameMapping.put(columnName, Utilities.ReduceField.VALUE + "." + columnName);
}
if (distributeKeys.contains(columnName)) {
// distribute column
partCols.add(e.clone());
}
}
// Create Key/Value TableDesc. When the operator plan is split into MR tasks,
// the reduce operator will initialize Extract operator with information
// from Key and Value TableDesc
List<FieldSchema> fields = PlanUtils.getFieldSchemasFromColumnList(keyCols, keyColNames, 0, "");
TableDesc keyTable = PlanUtils.getReduceKeyTableDesc(fields, order.toString(), nullOrder.toString());
List<FieldSchema> valFields = PlanUtils.getFieldSchemasFromColumnList(valCols, valColNames, 0, "");
TableDesc valueTable = PlanUtils.getReduceValueTableDesc(valFields);
List<List<Integer>> distinctColumnIndices = new ArrayList<>();
// Number of reducers is set to default (-1)
ReduceSinkDesc rsConf = new ReduceSinkDesc(keyCols, keyCols.size(), valCols, keyColNames, distinctColumnIndices, valColNames, -1, partCols, -1, keyTable, valueTable, Operation.NOT_ACID);
RowResolver rsRR = new RowResolver();
List<ColumnInfo> rsSignature = new ArrayList<>();
for (int index = 0; index < input.getSchema().getSignature().size(); index++) {
ColumnInfo colInfo = new ColumnInfo(input.getSchema().getSignature().get(index));
String[] nm = inputRR.reverseLookup(colInfo.getInternalName());
String[] nm2 = inputRR.getAlternateMappings(colInfo.getInternalName());
colInfo.setInternalName(nameMapping.get(colInfo.getInternalName()));
rsSignature.add(colInfo);
rsRR.put(nm[0], nm[1], colInfo);
if (nm2 != null) {
rsRR.addMappingOnly(nm2[0], nm2[1], colInfo);
}
}
Operator<?> result = putOpInsertMap(OperatorFactory.getAndMakeChild(rsConf, new RowSchema(rsSignature), input), rsRR);
result.setColumnExprMap(colExprMap);
// Create SEL operator
RowResolver selRR = new RowResolver();
List<ColumnInfo> selSignature = new ArrayList<>();
List<ExprNodeDesc> columnExprs = new ArrayList<>();
List<String> colNames = new ArrayList<>();
Map<String, ExprNodeDesc> selColExprMap = new HashMap<>();
for (int index = 0; index < input.getSchema().getSignature().size(); index++) {
ColumnInfo colInfo = new ColumnInfo(input.getSchema().getSignature().get(index));
String[] nm = inputRR.reverseLookup(colInfo.getInternalName());
String[] nm2 = inputRR.getAlternateMappings(colInfo.getInternalName());
selSignature.add(colInfo);
selRR.put(nm[0], nm[1], colInfo);
if (nm2 != null) {
selRR.addMappingOnly(nm2[0], nm2[1], colInfo);
}
String colName = colInfo.getInternalName();
ExprNodeDesc exprNodeDesc;
if (keys.contains(colName)) {
exprNodeDesc = new ExprNodeColumnDesc(colInfo.getType(), ReduceField.KEY.toString() + "." + colName, null, false);
columnExprs.add(exprNodeDesc);
} else {
exprNodeDesc = new ExprNodeColumnDesc(colInfo.getType(), ReduceField.VALUE.toString() + "." + colName, null, false);
columnExprs.add(exprNodeDesc);
}
colNames.add(colName);
selColExprMap.put(colName, exprNodeDesc);
}
SelectDesc selConf = new SelectDesc(columnExprs, colNames);
result = putOpInsertMap(OperatorFactory.getAndMakeChild(selConf, new RowSchema(selSignature), result), selRR);
result.setColumnExprMap(selColExprMap);
return result;
}
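The method routes every sort column through the KEY prefix and everything else through the VALUE prefix, and the SELECT that follows uses the same mapping to restore the original internal names. The following minimal sketch captures that name mapping; buildNameMapping is a hypothetical helper for illustration, not part of SemanticAnalyzer.

// Minimal sketch of the KEY./VALUE. name mapping built above: sort columns are
// emitted under the KEY prefix, everything else under VALUE, and the trailing
// SELECT uses the same mapping to restore the original column names.
// buildNameMapping is a hypothetical helper, not a Hive API.
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;

final class ReduceSinkNameMappingSketch {
  static Map<String, String> buildNameMapping(List<String> schema, Set<String> sortKeys) {
    Map<String, String> mapping = new LinkedHashMap<>();
    for (String col : schema) {
      String prefix = sortKeys.contains(col) ? "KEY." : "VALUE.";
      mapping.put(col, prefix + col);          // e.g. _col0 -> KEY._col0
    }
    return mapping;
  }

  public static void main(String[] args) {
    Map<String, String> m =
        buildNameMapping(List.of("_col0", "_col1", "_col2"), Set.of("_col0"));
    m.forEach((k, v) -> System.out.println(k + " -> " + v));
  }
}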