Use of org.apache.hadoop.hive.ql.plan.ReduceSinkDesc in project hive by apache.
From the class TopNKeyPushdownProcessor, method pushdownThroughLeftOuterJoin:
/**
 * Push through LOJ. If TopNKey expression refers fully to expressions from left input, push
 * with rewriting of expressions and remove from top of LOJ. If TopNKey expression has a prefix
 * that refers to expressions from left input, push with rewriting of those expressions and keep
 * on top of LOJ.
 *
 * @param topNKey TopNKey operator to push
 * @throws SemanticException when removeChildAndAdoptItsChildren was not successful
 */
private void pushdownThroughLeftOuterJoin(TopNKeyOperator topNKey) throws SemanticException {
  final TopNKeyDesc topNKeyDesc = topNKey.getConf();
  final CommonJoinOperator<? extends JoinDesc> join =
      (CommonJoinOperator<? extends JoinDesc>) topNKey.getParentOperators().get(0);
  final List<Operator<? extends OperatorDesc>> joinInputs = join.getParentOperators();
  final ReduceSinkOperator reduceSinkOperator = (ReduceSinkOperator) joinInputs.get(0);
  final ReduceSinkDesc reduceSinkDesc = reduceSinkOperator.getConf();
  CommonKeyPrefix commonKeyPrefix = CommonKeyPrefix.map(
      mapUntilColumnEquals(topNKeyDesc.getKeyColumns(), join.getColumnExprMap()),
      topNKeyDesc.getColumnSortOrder(), topNKeyDesc.getNullOrder(),
      reduceSinkDesc.getKeyCols(), reduceSinkDesc.getColumnExprMap(),
      reduceSinkDesc.getOrder(), reduceSinkDesc.getNullOrder());
  if (commonKeyPrefix.isEmpty() || commonKeyPrefix.size() == topNKeyDesc.getPartitionKeyColumns().size()) {
    return;
  }
  LOG.debug("Pushing a copy of {} through {} and {}",
      topNKey.getName(), join.getName(), reduceSinkOperator.getName());
  final TopNKeyDesc newTopNKeyDesc = topNKeyDesc.combine(commonKeyPrefix);
  pushdown((TopNKeyOperator) copyDown(reduceSinkOperator, newTopNKeyDesc));
  if (topNKeyDesc.getKeyColumns().size() == commonKeyPrefix.size()) {
    LOG.debug("Removing {} above {}", topNKey.getName(), join.getName());
    join.removeChildAndAdoptItsChildren(topNKey);
  }
}
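Whether the TopNKey can be pushed, and whether it can then be dropped from above the join, comes down to how long a common key prefix it shares with the left input's ReduceSink, taking sort order and null order into account. The standalone sketch below illustrates that matching on plain column-name strings; it is a simplified stand-in for CommonKeyPrefix.map (which works on ExprNodeDesc expressions and column-expression maps), not an excerpt from Hive.

import java.util.List;

// Simplified illustration of the common-prefix idea (not Hive's CommonKeyPrefix class):
// count how many leading TopNKey columns match the ReduceSink key columns while also
// agreeing on sort order and null order, assumed here to be encoded as per-column chars.
final class CommonPrefixSketch {
  static int commonPrefixLength(List<String> topNKeyCols, String topNOrder, String topNNullOrder,
      List<String> rsKeyCols, String rsOrder, String rsNullOrder) {
    int limit = Math.min(topNKeyCols.size(), rsKeyCols.size());
    int prefix = 0;
    for (int i = 0; i < limit; i++) {
      boolean sameColumn = topNKeyCols.get(i).equals(rsKeyCols.get(i));
      boolean sameOrder = topNOrder.charAt(i) == rsOrder.charAt(i);
      boolean sameNullOrder = topNNullOrder.charAt(i) == rsNullOrder.charAt(i);
      if (!sameColumn || !sameOrder || !sameNullOrder) {
        break;
      }
      prefix++;
    }
    return prefix;
  }
}

In the method above, a prefix that is empty or no longer than the partition key columns means nothing is pushed, while a prefix covering all TopNKey columns additionally allows the operator to be removed from above the join.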
Use of org.apache.hadoop.hive.ql.plan.ReduceSinkDesc in project hive by apache.
From the class SemanticAnalyzer, method genJoinReduceSinkChild:
@SuppressWarnings("nls")
private Operator genJoinReduceSinkChild(ExprNodeDesc[] joinKeys, Operator<?> parent,
    String[] srcs, int tag) throws SemanticException {
  // dummy for backtracking
  Operator dummy = Operator.createDummy();
  dummy.setParentOperators(Arrays.asList(parent));
  RowResolver inputRR = opParseCtx.get(parent).getRowResolver();
  RowResolver outputRR = new RowResolver();
  List<String> outputColumns = new ArrayList<String>();
  List<ExprNodeDesc> reduceKeys = new ArrayList<ExprNodeDesc>();
  List<ExprNodeDesc> reduceKeysBack = new ArrayList<ExprNodeDesc>();
  // Compute join keys and store in reduceKeys
  for (ExprNodeDesc joinKey : joinKeys) {
    reduceKeys.add(joinKey);
    reduceKeysBack.add(ExprNodeDescUtils.backtrack(joinKey, dummy, parent));
  }
  // Walk over the input row resolver and copy in the output
  ArrayList<ExprNodeDesc> reduceValues = new ArrayList<ExprNodeDesc>();
  List<ColumnInfo> columns = inputRR.getColumnInfos();
  int[] index = new int[columns.size()];
  for (int i = 0; i < columns.size(); i++) {
    ColumnInfo colInfo = columns.get(i);
    String[] nm = inputRR.reverseLookup(colInfo.getInternalName());
    String[] nm2 = inputRR.getAlternateMappings(colInfo.getInternalName());
    ExprNodeDesc expr = new ExprNodeColumnDesc(colInfo);
    // backtrack can be null when input is script operator
    ExprNodeDesc exprBack = ExprNodeDescUtils.backtrack(expr, dummy, parent);
    int kindex;
    if (exprBack == null) {
      kindex = -1;
    } else if (ExprNodeDescUtils.isConstant(exprBack)) {
      kindex = reduceKeysBack.indexOf(exprBack);
    } else {
      kindex = ExprNodeDescUtils.indexOf(exprBack, reduceKeysBack);
    }
    if (kindex >= 0) {
      ColumnInfo newColInfo = new ColumnInfo(colInfo);
      String internalColName = Utilities.ReduceField.KEY + ".reducesinkkey" + kindex;
      newColInfo.setInternalName(internalColName);
      newColInfo.setTabAlias(nm[0]);
      outputRR.put(nm[0], nm[1], newColInfo);
      if (nm2 != null) {
        outputRR.addMappingOnly(nm2[0], nm2[1], newColInfo);
      }
      index[i] = kindex;
      continue;
    }
    index[i] = -reduceValues.size() - 1;
    String outputColName = getColumnInternalName(reduceValues.size());
    reduceValues.add(expr);
    ColumnInfo newColInfo = new ColumnInfo(colInfo);
    String internalColName = Utilities.ReduceField.VALUE + "." + outputColName;
    newColInfo.setInternalName(internalColName);
    newColInfo.setTabAlias(nm[0]);
    outputRR.put(nm[0], nm[1], newColInfo);
    if (nm2 != null) {
      outputRR.addMappingOnly(nm2[0], nm2[1], newColInfo);
    }
    outputColumns.add(outputColName);
  }
  dummy.setParentOperators(null);
  int numReds = -1;
  // Use only 1 reducer in case of cartesian product
  if (reduceKeys.size() == 0) {
    numReds = 1;
    String error = StrictChecks.checkCartesian(conf);
    if (error != null) {
      throw new SemanticException(error);
    }
  }
  ReduceSinkDesc rsDesc = PlanUtils.getReduceSinkDesc(reduceKeys, reduceValues, outputColumns,
      false, tag, reduceKeys.size(), numReds, AcidUtils.Operation.NOT_ACID, defaultNullOrder);
  Map<String, String> translatorMap = new HashMap<String, String>();
  Map<String, ExprNodeDesc> colExprMap = new HashMap<String, ExprNodeDesc>();
  List<String> keyColNames = rsDesc.getOutputKeyColumnNames();
  for (int i = 0; i < keyColNames.size(); i++) {
    String oldName = keyColNames.get(i);
    String newName = Utilities.ReduceField.KEY + "." + oldName;
    colExprMap.put(newName, reduceKeys.get(i));
    translatorMap.put(oldName, newName);
  }
  List<String> valColNames = rsDesc.getOutputValueColumnNames();
  for (int i = 0; i < valColNames.size(); i++) {
    String oldName = valColNames.get(i);
    String newName = Utilities.ReduceField.VALUE + "." + oldName;
    colExprMap.put(newName, reduceValues.get(i));
    translatorMap.put(oldName, newName);
  }
  RowSchema defaultRs = new RowSchema(outputRR.getColumnInfos());
  List<ColumnInfo> newColumnInfos = new ArrayList<ColumnInfo>();
  for (ColumnInfo ci : outputRR.getColumnInfos()) {
    if (translatorMap.containsKey(ci.getInternalName())) {
      ci = new ColumnInfo(ci);
      ci.setInternalName(translatorMap.get(ci.getInternalName()));
    }
    newColumnInfos.add(ci);
  }
  ReduceSinkOperator rsOp = (ReduceSinkOperator) putOpInsertMap(
      OperatorFactory.getAndMakeChild(rsDesc, new RowSchema(newColumnInfos), parent), outputRR);
  rsOp.setValueIndex(index);
  rsOp.setColumnExprMap(colExprMap);
  rsOp.setInputAliases(srcs);
  return rsOp;
}
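The index array built above records, for each input column, where its value ends up in the ReduceSink output: a non-negative entry is a position in the reduce key (KEY.reducesinkkeyN), while a column emitted as the p-th value column is stored as -p - 1. A small sketch of that encoding, using hypothetical helper names that are not part of SemanticAnalyzer:

// Encoding used by the index[] array above: key columns keep their key position,
// value columns at position p are stored as -p - 1 (so the first value column is -1).
final class ValueIndexSketch {
  static int encodeValuePosition(int valuePosition) {
    return -valuePosition - 1;
  }

  static boolean pointsIntoKey(int encoded) {
    return encoded >= 0;
  }

  static int decodeValuePosition(int encoded) {
    return -encoded - 1; // inverse of encodeValuePosition
  }
}

For example, encodeValuePosition(0) yields -1, matching the index[i] = -reduceValues.size() - 1 assignment for the first column routed to VALUE.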
Use of org.apache.hadoop.hive.ql.plan.ReduceSinkDesc in project hive by apache.
From the class SetSparkReducerParallelism, method process:
@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procContext,
    Object... nodeOutputs) throws SemanticException {
  OptimizeSparkProcContext context = (OptimizeSparkProcContext) procContext;
  ReduceSinkOperator sink = (ReduceSinkOperator) nd;
  ReduceSinkDesc desc = sink.getConf();
  Set<ReduceSinkOperator> parentSinks = null;
  int maxReducers = context.getConf().getIntVar(HiveConf.ConfVars.MAXREDUCERS);
  int constantReducers = context.getConf().getIntVar(HiveConf.ConfVars.HADOOPNUMREDUCERS);
  if (!useOpStats) {
    parentSinks = OperatorUtils.findOperatorsUpstream(sink, ReduceSinkOperator.class);
    parentSinks.remove(sink);
    if (!context.getVisitedReduceSinks().containsAll(parentSinks)) {
      // We haven't processed all the parent sinks, and we need
      // them to be done in order to compute the parallelism for this sink.
      // In this case, skip. We should visit this again from another path.
      LOG.debug("Skipping sink " + sink + " for now as we haven't seen all its parents.");
      return false;
    }
  }
  if (context.getVisitedReduceSinks().contains(sink)) {
    // skip walking the children
    LOG.debug("Already processed reduce sink: " + sink.getName());
    return true;
  }
  context.getVisitedReduceSinks().add(sink);
  if (needSetParallelism(sink, context.getConf())) {
    if (constantReducers > 0) {
      LOG.info("Parallelism for reduce sink " + sink + " set by user to " + constantReducers);
      desc.setNumReducers(constantReducers);
    } else {
      // If it's a FileSink to bucketed files, use the bucket count as the reducer number
      FileSinkOperator fso = GenSparkUtils.getChildOperator(sink, FileSinkOperator.class);
      if (fso != null) {
        String bucketCount = fso.getConf().getTableInfo().getProperties()
            .getProperty(hive_metastoreConstants.BUCKET_COUNT);
        int numBuckets = bucketCount == null ? 0 : Integer.parseInt(bucketCount);
        if (numBuckets > 0) {
          LOG.info("Set parallelism for reduce sink " + sink + " to: " + numBuckets + " (buckets)");
          desc.setNumReducers(numBuckets);
          return false;
        }
      }
      if (useOpStats || parentSinks.isEmpty()) {
        long numberOfBytes = 0;
        if (useOpStats) {
          // we need to add up all the estimates from the siblings of this reduce sink
          for (Operator<? extends OperatorDesc> sibling
              : sink.getChildOperators().get(0).getParentOperators()) {
            if (sibling.getStatistics() != null) {
              numberOfBytes = StatsUtils.safeAdd(numberOfBytes, sibling.getStatistics().getDataSize());
              if (LOG.isDebugEnabled()) {
                LOG.debug("Sibling " + sibling + " has stats: " + sibling.getStatistics());
              }
            } else {
              LOG.warn("No stats available from: " + sibling);
            }
          }
        } else {
          // we should use TS stats to infer parallelism
          for (Operator<? extends OperatorDesc> sibling
              : sink.getChildOperators().get(0).getParentOperators()) {
            Set<TableScanOperator> sources =
                OperatorUtils.findOperatorsUpstream(sibling, TableScanOperator.class);
            for (TableScanOperator source : sources) {
              if (source.getStatistics() != null) {
                numberOfBytes = StatsUtils.safeAdd(numberOfBytes, source.getStatistics().getDataSize());
                if (LOG.isDebugEnabled()) {
                  LOG.debug("Table source " + source + " has stats: " + source.getStatistics());
                }
              } else {
                LOG.warn("No stats available from table source: " + source);
              }
            }
          }
          LOG.debug("Gathered stats for sink " + sink + ". Total size is " + numberOfBytes + " bytes.");
        }
        // Divide it by 2 so that we can have more reducers
        long bytesPerReducer = context.getConf().getLongVar(HiveConf.ConfVars.BYTESPERREDUCER) / 2;
        int numReducers = Utilities.estimateReducers(numberOfBytes, bytesPerReducer, maxReducers, false);
        getSparkMemoryAndCores(context);
        if (sparkMemoryAndCores != null
            && sparkMemoryAndCores.getLeft() > 0 && sparkMemoryAndCores.getRight() > 0) {
          // warn the user if bytes per reducer is much larger than memory per task
          if ((double) sparkMemoryAndCores.getLeft() / bytesPerReducer < 0.5) {
            LOG.warn("Average load of a reducer is much larger than its available memory. "
                + "Consider decreasing hive.exec.reducers.bytes.per.reducer");
          }
          // If there are more cores, use the number of cores
          numReducers = Math.max(numReducers, sparkMemoryAndCores.getRight());
        }
        numReducers = Math.min(numReducers, maxReducers);
        LOG.info("Set parallelism for reduce sink " + sink + " to: " + numReducers + " (calculated)");
        desc.setNumReducers(numReducers);
      } else {
        // Use the maximum parallelism from all parent reduce sinks
        int numberOfReducers = 0;
        for (ReduceSinkOperator parent : parentSinks) {
          numberOfReducers = Math.max(numberOfReducers, parent.getConf().getNumReducers());
        }
        desc.setNumReducers(numberOfReducers);
        LOG.debug("Set parallelism for sink " + sink + " to " + numberOfReducers + " based on its parents");
      }
      final Collection<ExprNodeDesc.ExprNodeDescEqualityWrapper> keyCols =
          ExprNodeDesc.ExprNodeDescEqualityWrapper.transform(desc.getKeyCols());
      final Collection<ExprNodeDesc.ExprNodeDescEqualityWrapper> partCols =
          ExprNodeDesc.ExprNodeDescEqualityWrapper.transform(desc.getPartitionCols());
      if (keyCols != null && keyCols.equals(partCols)) {
        desc.setReducerTraits(EnumSet.of(UNIFORM));
      }
    }
  } else {
    LOG.info("Number of reducers for sink " + sink + " was already determined to be: " + desc.getNumReducers());
  }
  return false;
}
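When operator or table-scan statistics are available, the reducer count comes down to a few arithmetic steps: halve hive.exec.reducers.bytes.per.reducer, estimate from the byte total, raise the estimate to the executor core count when cores outnumber it, and cap it at hive.exec.reducers.max. The sketch below mirrors that arithmetic; it approximates Utilities.estimateReducers with a plain ceiling division, so treat it as an illustration rather than the exact Hive formula.

// Rough, illustrative sketch of the sizing logic above (Utilities.estimateReducers
// applies additional rules beyond the ceiling division used here).
final class ReducerSizingSketch {
  static int estimateParallelism(long totalBytes, long configuredBytesPerReducer,
      int availableCores, int maxReducers) {
    long bytesPerReducer = Math.max(1, configuredBytesPerReducer / 2); // halved "so that we can have more reducers"
    long estimate = Math.max(1, (totalBytes + bytesPerReducer - 1) / bytesPerReducer);
    estimate = Math.max(estimate, availableCores);                     // if there are more cores, use the cores
    return (int) Math.min(estimate, maxReducers);                      // never exceed hive.exec.reducers.max
  }
}

For instance, 10 GB of input with a 256 MB bytes-per-reducer setting (halved to 128 MB) yields an estimate of 80 reducers, which is then raised to the core count only if more cores are available and finally capped at maxReducers.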
Use of org.apache.hadoop.hive.ql.plan.ReduceSinkDesc in project hive by apache.
From the class TestSharedWorkOptimizer, method ensureDeduplicate:
private void ensureDeduplicate(EnumSet<ReduceSinkDesc.ReducerTraits> traits1, int numReducers1,
    EnumSet<ReduceSinkDesc.ReducerTraits> traits2, int numReducers2,
    EnumSet<ReduceSinkDesc.ReducerTraits> expectedTraits, int expectedNumReducers) {
  ReduceSinkDesc rsConf1;
  ReduceSinkDesc rsConf2;
  boolean deduplicated;
  rsConf1 = new ReduceSinkDesc();
  rsConf1.setReducerTraits(traits1);
  rsConf1.setNumReducers(numReducers1);
  rsConf2 = new ReduceSinkDesc();
  rsConf2.setReducerTraits(traits2);
  rsConf2.setNumReducers(numReducers2);
  deduplicated = SharedWorkOptimizer.deduplicateReduceTraits(rsConf1, rsConf2);
  assertTrue(deduplicated);
  assertEquals(expectedTraits, rsConf1.getReducerTraits());
  assertEquals(expectedNumReducers, rsConf1.getNumReducers());
  rsConf1 = new ReduceSinkDesc();
  rsConf1.setReducerTraits(traits1);
  rsConf1.setNumReducers(numReducers1);
  rsConf2 = new ReduceSinkDesc();
  rsConf2.setReducerTraits(traits2);
  rsConf2.setNumReducers(numReducers2);
  deduplicated = SharedWorkOptimizer.deduplicateReduceTraits(rsConf2, rsConf1);
  assertTrue(deduplicated);
  assertEquals(expectedTraits, rsConf2.getReducerTraits());
  assertEquals(expectedNumReducers, rsConf2.getNumReducers());
}
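A test built on this helper could look like the following; the trait combination and expected outcome are illustrative assumptions about SharedWorkOptimizer.deduplicateReduceTraits (two sinks that already agree on a FIXED reducer count should merge unchanged), not a case taken from Hive's test suite.

@Test
public void testDeduplicationOfIdenticalFixedSinks() {
  // Illustrative assumption: identical FIXED parallelism merges without change.
  ensureDeduplicate(
      EnumSet.of(ReduceSinkDesc.ReducerTraits.FIXED), 4,
      EnumSet.of(ReduceSinkDesc.ReducerTraits.FIXED), 4,
      EnumSet.of(ReduceSinkDesc.ReducerTraits.FIXED), 4);
}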
Use of org.apache.hadoop.hive.ql.plan.ReduceSinkDesc in project hive by apache.
From the class TestSharedWorkOptimizer, method ensureNotDeduplicate:
private void ensureNotDeduplicate(EnumSet<ReduceSinkDesc.ReducerTraits> traits1, int numReducers1,
    EnumSet<ReduceSinkDesc.ReducerTraits> traits2, int numReducers2) {
  ReduceSinkDesc rsConf1;
  ReduceSinkDesc rsConf2;
  boolean deduplicated;
  rsConf1 = new ReduceSinkDesc();
  rsConf1.setReducerTraits(traits1);
  rsConf1.setNumReducers(numReducers1);
  rsConf2 = new ReduceSinkDesc();
  rsConf2.setReducerTraits(traits2);
  rsConf2.setNumReducers(numReducers2);
  deduplicated = SharedWorkOptimizer.deduplicateReduceTraits(rsConf1, rsConf2);
  assertFalse(deduplicated);
  rsConf1 = new ReduceSinkDesc();
  rsConf1.setReducerTraits(traits1);
  rsConf1.setNumReducers(numReducers1);
  rsConf2 = new ReduceSinkDesc();
  rsConf2.setReducerTraits(traits2);
  rsConf2.setNumReducers(numReducers2);
  deduplicated = SharedWorkOptimizer.deduplicateReduceTraits(rsConf2, rsConf1);
  assertFalse(deduplicated);
}
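The negative helper can be exercised the same way; again, the concrete inputs are an illustrative assumption (two sinks fixed at different reducer counts presumably cannot be merged), not a case copied from the Hive tests.

@Test
public void testNoDeduplicationForConflictingFixedCounts() {
  // Illustrative assumption: conflicting FIXED reducer counts cannot be deduplicated.
  ensureNotDeduplicate(
      EnumSet.of(ReduceSinkDesc.ReducerTraits.FIXED), 2,
      EnumSet.of(ReduceSinkDesc.ReducerTraits.FIXED), 3);
}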