Use of org.apache.hadoop.hive.ql.parse.spark.OptimizeSparkProcContext in project hive by apache.
From the class SparkRemoveDynamicPruningBySize, method process:
@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procContext, Object... nodeOutputs) throws SemanticException {
OptimizeSparkProcContext context = (OptimizeSparkProcContext) procContext;
SparkPartitionPruningSinkOperator op = (SparkPartitionPruningSinkOperator) nd;
SparkPartitionPruningSinkDesc desc = op.getConf();
if (desc.getStatistics().getDataSize() > context.getConf().getLongVar(ConfVars.SPARK_DYNAMIC_PARTITION_PRUNING_MAX_DATA_SIZE)) {
OperatorUtils.removeBranch(op);
// at this point we've found the fork in the op pipeline that has the pruning as a child plan.
LOG.info("Disabling dynamic pruning for: " + desc.getTableScan().getName() + ". Expected data size is too big: " + desc.getStatistics().getDataSize());
}
return false;
}
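For context, the threshold the processor compares against comes from HiveConf. Below is a minimal, hypothetical sketch (not Hive source) of that size check, using only the ConfVars constant that appears in the snippet; the class and method names are made up for illustration.
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.conf.HiveConf.ConfVars;

public class PruningSizeCheckSketch {
  // Returns true when a pruning branch whose estimated output is dataSize bytes should be dropped.
  static boolean shouldRemovePruningBranch(HiveConf conf, long dataSize) {
    long maxDataSize = conf.getLongVar(ConfVars.SPARK_DYNAMIC_PARTITION_PRUNING_MAX_DATA_SIZE);
    return dataSize > maxDataSize;
  }

  public static void main(String[] args) {
    HiveConf conf = new HiveConf();
    // Tighten the threshold to 1 MB for the sake of the example (the value is in bytes).
    conf.setLongVar(ConfVars.SPARK_DYNAMIC_PARTITION_PRUNING_MAX_DATA_SIZE, 1024L * 1024L);
    System.out.println(shouldRemovePruningBranch(conf, 10L * 1024L * 1024L)); // true: branch would be removed
  }
}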
Use of org.apache.hadoop.hive.ql.parse.spark.OptimizeSparkProcContext in project hive by apache.
From the class SparkJoinHintOptimizer, method process:
@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx, Object... nodeOutputs) throws SemanticException {
OptimizeSparkProcContext context = (OptimizeSparkProcContext) procCtx;
HiveConf hiveConf = context.getParseContext().getConf();
// Convert from mapjoin to bucket map join if enabled.
if (hiveConf.getBoolVar(HiveConf.ConfVars.HIVEOPTBUCKETMAPJOIN) || hiveConf.getBoolVar(HiveConf.ConfVars.HIVEOPTSORTMERGEBUCKETMAPJOIN)) {
BucketJoinProcCtx bjProcCtx = new BucketJoinProcCtx(hiveConf);
bucketMapJoinOptimizer.process(nd, stack, bjProcCtx, nodeOutputs);
}
// Convert from bucket map join to sort merge bucket map join if enabled.
if (hiveConf.getBoolVar(HiveConf.ConfVars.HIVEOPTSORTMERGEBUCKETMAPJOIN)) {
SortBucketJoinProcCtx smbJoinCtx = new SortBucketJoinProcCtx(hiveConf);
smbMapJoinOptimizer.process(nd, stack, smbJoinCtx, nodeOutputs);
}
return null;
}
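The two passes above are gated purely by configuration flags. A small illustrative sketch (not Hive source) of reading and setting those flags with the same ConfVars constants:
import org.apache.hadoop.hive.conf.HiveConf;

public class JoinHintFlagsSketch {
  public static void main(String[] args) {
    HiveConf conf = new HiveConf();
    conf.setBoolVar(HiveConf.ConfVars.HIVEOPTBUCKETMAPJOIN, true);          // allow map join -> bucket map join
    conf.setBoolVar(HiveConf.ConfVars.HIVEOPTSORTMERGEBUCKETMAPJOIN, true); // allow bucket map join -> SMB map join

    // Same gating conditions as the two if-blocks above.
    boolean runBucketMapJoinPass = conf.getBoolVar(HiveConf.ConfVars.HIVEOPTBUCKETMAPJOIN)
        || conf.getBoolVar(HiveConf.ConfVars.HIVEOPTSORTMERGEBUCKETMAPJOIN);
    boolean runSmbJoinPass = conf.getBoolVar(HiveConf.ConfVars.HIVEOPTSORTMERGEBUCKETMAPJOIN);
    System.out.println("bucket map join pass: " + runBucketMapJoinPass + ", SMB pass: " + runSmbJoinPass);
  }
}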
Use of org.apache.hadoop.hive.ql.parse.spark.OptimizeSparkProcContext in project hive by apache.
From the class SparkMapJoinOptimizer, method process:
/**
 * We should ideally not modify the tree we traverse. However,
 * since we need to walk the tree at any time when we modify the operator, we
 * might as well do it here.
 */
@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx, Object... nodeOutputs) throws SemanticException {
OptimizeSparkProcContext context = (OptimizeSparkProcContext) procCtx;
HiveConf conf = context.getConf();
JoinOperator joinOp = (JoinOperator) nd;
if (!conf.getBoolVar(HiveConf.ConfVars.HIVECONVERTJOIN)) {
return null;
}
LOG.info("Check if it can be converted to map join");
long[] mapJoinInfo = getMapJoinConversionInfo(joinOp, context);
int mapJoinConversionPos = (int) mapJoinInfo[0];
if (mapJoinConversionPos < 0) {
return null;
}
int numBuckets = -1;
List<List<String>> bucketColNames = null;
LOG.info("Convert to non-bucketed map join");
MapJoinOperator mapJoinOp = convertJoinMapJoin(joinOp, context, mapJoinConversionPos);
// the native vectorized map join path reads keys with BinarySortableSerDe, so switch the key
// table SerDe here; changing the SerDe won't hurt correctness
if (conf.getBoolVar(HiveConf.ConfVars.HIVE_VECTORIZATION_MAPJOIN_NATIVE_ENABLED) && conf.getBoolVar(HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED)) {
mapJoinOp.getConf().getKeyTblDesc().getProperties().setProperty(serdeConstants.SERIALIZATION_LIB, BinarySortableSerDe.class.getName());
}
if (conf.getBoolVar(HiveConf.ConfVars.HIVEOPTBUCKETMAPJOIN)) {
LOG.info("Check if it can be converted to bucketed map join");
numBuckets = convertJoinBucketMapJoin(joinOp, mapJoinOp, context, mapJoinConversionPos);
if (numBuckets > 1) {
LOG.info("Converted to map join with " + numBuckets + " buckets");
bucketColNames = joinOp.getOpTraits().getBucketColNames();
mapJoinInfo[2] /= numBuckets;
} else {
LOG.info("Can not convert to bucketed map join");
}
}
// we can set the traits for this join operator
OpTraits opTraits = new OpTraits(bucketColNames, numBuckets, null, joinOp.getOpTraits().getNumReduceSinks());
mapJoinOp.setOpTraits(opTraits);
mapJoinOp.setStatistics(joinOp.getStatistics());
setNumberOfBucketsOnChildren(mapJoinOp);
context.getMjOpSizes().put(mapJoinOp, mapJoinInfo[1] + mapJoinInfo[2]);
return mapJoinOp;
}
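getMapJoinConversionInfo() is not shown here; conceptually it must decide whether every input except one fits under a broadcast-size threshold. A hedged sketch of that decision follows; the use of HIVECONVERTJOINNOCONDITIONALTASKTHRESHOLD and the helper names are assumptions for illustration, not the actual implementation.
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.conf.HiveConf.ConfVars;

public class MapJoinSizeCheckSketch {
  // Returns the index of the stream whose siblings all fit under the broadcast threshold, or -1.
  static int pickBigTablePosition(HiveConf conf, long[] inputSizes) {
    // Assumed threshold; the real getMapJoinConversionInfo() may use additional inputs and checks.
    long threshold = conf.getLongVar(ConfVars.HIVECONVERTJOINNOCONDITIONALTASKTHRESHOLD);
    for (int big = 0; big < inputSizes.length; big++) {
      long smallSideTotal = 0;
      for (int i = 0; i < inputSizes.length; i++) {
        if (i != big) {
          smallSideTotal += inputSizes[i];
        }
      }
      if (smallSideTotal <= threshold) {
        return big;   // every other input could be hashed in memory on the executors
      }
    }
    return -1;        // no candidate: keep the common (shuffle) join
  }
}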
Use of org.apache.hadoop.hive.ql.parse.spark.OptimizeSparkProcContext in project hive by apache.
From the class DynamicPartitionPruningOptimization, method process:
@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx, Object... nodeOutputs) throws SemanticException {
ParseContext parseContext;
if (procCtx instanceof OptimizeTezProcContext) {
parseContext = ((OptimizeTezProcContext) procCtx).parseContext;
} else if (procCtx instanceof OptimizeSparkProcContext) {
parseContext = ((OptimizeSparkProcContext) procCtx).getParseContext();
} else {
throw new IllegalArgumentException("expected parseContext to be either " + "OptimizeTezProcContext or OptimizeSparkProcContext, but found " + procCtx.getClass().getName());
}
FilterOperator filter = (FilterOperator) nd;
FilterDesc desc = filter.getConf();
if (!parseContext.getConf().getBoolVar(ConfVars.TEZ_DYNAMIC_PARTITION_PRUNING) && !parseContext.getConf().getBoolVar(ConfVars.SPARK_DYNAMIC_PARTITION_PRUNING)) {
// nothing to do when the optimization is off
return null;
}
TableScanOperator ts = null;
if (filter.getParentOperators().size() == 1 && filter.getParentOperators().get(0) instanceof TableScanOperator) {
ts = (TableScanOperator) filter.getParentOperators().get(0);
}
if (LOG.isDebugEnabled()) {
LOG.debug("Parent: " + filter.getParentOperators().get(0));
LOG.debug("Filter: " + desc.getPredicateString());
LOG.debug("TableScan: " + ts);
}
DynamicPartitionPrunerContext removerContext = new DynamicPartitionPrunerContext();
// collect the dynamic pruning conditions
removerContext.dynLists.clear();
collectDynamicPruningConditions(desc.getPredicate(), removerContext);
if (ts == null) {
// Replace the synthetic predicate with true and bail out
for (DynamicListContext ctx : removerContext) {
ExprNodeDesc constNode = new ExprNodeConstantDesc(ctx.parent.getTypeInfo(), true);
replaceExprNode(ctx, desc, constNode);
}
return false;
}
final boolean semiJoin = parseContext.getConf().getBoolVar(ConfVars.TEZ_DYNAMIC_SEMIJOIN_REDUCTION);
for (DynamicListContext ctx : removerContext) {
String column = ExprNodeDescUtils.extractColName(ctx.parent);
boolean semiJoinAttempted = false;
if (column != null) {
// Need unique IDs to refer to each min/max key value in the DynamicValueRegistry
String keyBaseAlias = "";
Table table = ts.getConf().getTableMetadata();
if (table != null && table.isPartitionKey(column)) {
String columnType = table.getPartColByName(column).getType();
String alias = ts.getConf().getAlias();
PrunedPartitionList plist = parseContext.getPrunedPartitions(alias, ts);
if (LOG.isDebugEnabled()) {
LOG.debug("alias: " + alias);
LOG.debug("pruned partition list: ");
if (plist != null) {
for (Partition p : plist.getPartitions()) {
LOG.debug(p.getCompleteName());
}
}
}
// only generate the pruning plan if the partition list is unknown or non-empty, i.e. the
// partitions have not already been statically filtered away
if (plist == null || plist.getPartitions().size() != 0) {
LOG.info("Dynamic partitioning: " + table.getCompleteName() + "." + column);
generateEventOperatorPlan(ctx, parseContext, ts, column, columnType);
} else {
// all partitions have been statically removed
LOG.debug("No partition pruning necessary.");
}
} else {
LOG.debug("Column " + column + " is not a partition column");
if (semiJoin && ts.getConf().getFilterExpr() != null) {
LOG.debug("Initiate semijoin reduction for " + column);
// Get the table name from which the min-max values will come.
Operator<?> op = ctx.generator;
while (!(op == null || op instanceof TableScanOperator)) {
op = op.getParentOperators().get(0);
}
String tableAlias = (op == null ? "" : ((TableScanOperator) op).getConf().getAlias());
keyBaseAlias = ctx.generator.getOperatorId() + "_" + tableAlias + "_" + column;
semiJoinAttempted = generateSemiJoinOperatorPlan(ctx, parseContext, ts, keyBaseAlias);
}
}
// we always remove the condition by replacing it with "true"
if (semiJoinAttempted) {
List<ExprNodeDesc> betweenArgs = new ArrayList<ExprNodeDesc>();
// Do not invert between result
betweenArgs.add(new ExprNodeConstantDesc(Boolean.FALSE));
// add column expression here
betweenArgs.add(ctx.parent.getChildren().get(0));
betweenArgs.add(new ExprNodeDynamicValueDesc(new DynamicValue(keyBaseAlias + "_min", ctx.desc.getTypeInfo())));
betweenArgs.add(new ExprNodeDynamicValueDesc(new DynamicValue(keyBaseAlias + "_max", ctx.desc.getTypeInfo())));
ExprNodeDesc betweenNode = ExprNodeGenericFuncDesc.newInstance(FunctionRegistry.getFunctionInfo("between").getGenericUDF(), betweenArgs);
// add column expression for bloom filter
List<ExprNodeDesc> bloomFilterArgs = new ArrayList<ExprNodeDesc>();
bloomFilterArgs.add(ctx.parent.getChildren().get(0));
bloomFilterArgs.add(new ExprNodeDynamicValueDesc(new DynamicValue(keyBaseAlias + "_bloom_filter", TypeInfoFactory.binaryTypeInfo)));
ExprNodeDesc bloomFilterNode = ExprNodeGenericFuncDesc.newInstance(FunctionRegistry.getFunctionInfo("in_bloom_filter").getGenericUDF(), bloomFilterArgs);
List<ExprNodeDesc> andArgs = new ArrayList<ExprNodeDesc>();
andArgs.add(betweenNode);
andArgs.add(bloomFilterNode);
ExprNodeDesc andExpr = ExprNodeGenericFuncDesc.newInstance(FunctionRegistry.getFunctionInfo("and").getGenericUDF(), andArgs);
replaceExprNode(ctx, desc, andExpr);
} else {
ExprNodeDesc replaceNode = new ExprNodeConstantDesc(ctx.parent.getTypeInfo(), true);
replaceExprNode(ctx, desc, replaceNode);
}
} else {
ExprNodeDesc constNode = new ExprNodeConstantDesc(ctx.parent.getTypeInfo(), true);
replaceExprNode(ctx, desc, constNode);
}
}
// if we pushed the predicate into the table scan we need to remove the
// synthetic conditions there.
cleanTableScanFilters(ts);
return false;
}
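replaceExprNode() is also not shown. Conceptually it has to splice a replacement expression (constant TRUE, or the between/bloom-filter conjunction built above) into the filter predicate tree. Below is a simplified sketch under that assumption, using only the public ExprNodeDesc/FilterDesc accessors; the real method works off the DynamicListContext bookkeeping rather than a tree search, and the class name is hypothetical.
import java.util.List;

import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.FilterDesc;

public class ReplaceExprNodeSketch {
  // Replace 'target' with 'replacement' wherever it occurs in the filter predicate tree.
  static void replace(FilterDesc desc, ExprNodeDesc target, ExprNodeDesc replacement) {
    if (desc.getPredicate() == target) {
      desc.setPredicate(replacement);
      return;
    }
    replaceInChildren(desc.getPredicate(), target, replacement);
  }

  private static void replaceInChildren(ExprNodeDesc node, ExprNodeDesc target, ExprNodeDesc replacement) {
    List<ExprNodeDesc> children = node.getChildren();
    if (children == null) {
      return;
    }
    for (int i = 0; i < children.size(); i++) {
      if (children.get(i) == target) {
        children.set(i, replacement);   // splice in constant TRUE or the semi-join predicate
      } else {
        replaceInChildren(children.get(i), target, replacement);
      }
    }
  }
}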
Use of org.apache.hadoop.hive.ql.parse.spark.OptimizeSparkProcContext in project hive by apache.
From the class SetSparkReducerParallelism, method process:
@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procContext, Object... nodeOutputs) throws SemanticException {
OptimizeSparkProcContext context = (OptimizeSparkProcContext) procContext;
ReduceSinkOperator sink = (ReduceSinkOperator) nd;
ReduceSinkDesc desc = sink.getConf();
Set<ReduceSinkOperator> parentSinks = null;
int maxReducers = context.getConf().getIntVar(HiveConf.ConfVars.MAXREDUCERS);
int constantReducers = context.getConf().getIntVar(HiveConf.ConfVars.HADOOPNUMREDUCERS);
if (!useOpStats) {
parentSinks = OperatorUtils.findOperatorsUpstream(sink, ReduceSinkOperator.class);
parentSinks.remove(sink);
if (!context.getVisitedReduceSinks().containsAll(parentSinks)) {
// We haven't processed all the parent sinks, and we need
// them to be done in order to compute the parallelism for this sink.
// In this case, skip. We should visit this again from another path.
LOG.debug("Skipping sink " + sink + " for now as we haven't seen all its parents.");
return false;
}
}
if (context.getVisitedReduceSinks().contains(sink)) {
// skip walking the children
LOG.debug("Already processed reduce sink: " + sink.getName());
return true;
}
context.getVisitedReduceSinks().add(sink);
if (needSetParallelism(sink, context.getConf())) {
if (constantReducers > 0) {
LOG.info("Parallelism for reduce sink " + sink + " set by user to " + constantReducers);
desc.setNumReducers(constantReducers);
} else {
// If it's a FileSink to bucketed files, use the bucket count as the reducer number
FileSinkOperator fso = GenSparkUtils.getChildOperator(sink, FileSinkOperator.class);
if (fso != null) {
String bucketCount = fso.getConf().getTableInfo().getProperties().getProperty(hive_metastoreConstants.BUCKET_COUNT);
int numBuckets = bucketCount == null ? 0 : Integer.parseInt(bucketCount);
if (numBuckets > 0) {
LOG.info("Set parallelism for reduce sink " + sink + " to: " + numBuckets + " (buckets)");
desc.setNumReducers(numBuckets);
return false;
}
}
long numberOfBytes = 0;
if (useOpStats) {
// we need to add up all the estimates from the siblings of this reduce sink
for (Operator<? extends OperatorDesc> sibling : sink.getChildOperators().get(0).getParentOperators()) {
if (sibling.getStatistics() != null) {
numberOfBytes += sibling.getStatistics().getDataSize();
if (LOG.isDebugEnabled()) {
LOG.debug("Sibling " + sibling + " has stats: " + sibling.getStatistics());
}
} else {
LOG.warn("No stats available from: " + sibling);
}
}
} else if (parentSinks.isEmpty()) {
// we should use TS stats to infer parallelism
for (Operator<? extends OperatorDesc> sibling : sink.getChildOperators().get(0).getParentOperators()) {
Set<TableScanOperator> sources = OperatorUtils.findOperatorsUpstream(sibling, TableScanOperator.class);
for (TableScanOperator source : sources) {
if (source.getStatistics() != null) {
numberOfBytes += source.getStatistics().getDataSize();
if (LOG.isDebugEnabled()) {
LOG.debug("Table source " + source + " has stats: " + source.getStatistics());
}
} else {
LOG.warn("No stats available from table source: " + source);
}
}
}
LOG.debug("Gathered stats for sink " + sink + ". Total size is " + numberOfBytes + " bytes.");
} else {
// Use the maximum parallelism from all parent reduce sinks
int numberOfReducers = 0;
for (ReduceSinkOperator parent : parentSinks) {
numberOfReducers = Math.max(numberOfReducers, parent.getConf().getNumReducers());
}
desc.setNumReducers(numberOfReducers);
LOG.debug("Set parallelism for sink " + sink + " to " + numberOfReducers + " based on its parents");
return false;
}
// Divide it by 2 so that we can have more reducers
long bytesPerReducer = context.getConf().getLongVar(HiveConf.ConfVars.BYTESPERREDUCER) / 2;
int numReducers = Utilities.estimateReducers(numberOfBytes, bytesPerReducer, maxReducers, false);
getSparkMemoryAndCores(context);
if (sparkMemoryAndCores != null && sparkMemoryAndCores.getFirst() > 0 && sparkMemoryAndCores.getSecond() > 0) {
// warn the user if bytes per reducer is much larger than memory per task
if ((double) sparkMemoryAndCores.getFirst() / bytesPerReducer < 0.5) {
LOG.warn("Average load of a reducer is much larger than its available memory. " + "Consider decreasing hive.exec.reducers.bytes.per.reducer");
}
// If there are more cores, use the number of cores
numReducers = Math.max(numReducers, sparkMemoryAndCores.getSecond());
}
numReducers = Math.min(numReducers, maxReducers);
LOG.info("Set parallelism for reduce sink " + sink + " to: " + numReducers + " (calculated)");
desc.setNumReducers(numReducers);
}
} else {
LOG.info("Number of reducers for sink " + sink + " was already determined to be: " + desc.getNumReducers());
}
return false;
}
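The final reducer count is driven by Utilities.estimateReducers(), the halved bytes-per-reducer value, and the configured maximum. The following is a back-of-the-envelope sketch of that sizing logic (an approximation, not the real estimateReducers); the class name and the numbers in main are illustrative.
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.conf.HiveConf.ConfVars;

public class ReducerEstimateSketch {
  // Roughly: one reducer per bytesPerReducer of input, at least 1, capped at maxReducers.
  static int estimateReducers(long totalBytes, long bytesPerReducer, int maxReducers) {
    int reducers = (int) Math.max(1L, (totalBytes + bytesPerReducer - 1) / bytesPerReducer); // ceiling division
    return Math.min(reducers, maxReducers);
  }

  public static void main(String[] args) {
    HiveConf conf = new HiveConf();
    long bytesPerReducer = conf.getLongVar(ConfVars.BYTESPERREDUCER) / 2;  // same halving as the snippet
    int maxReducers = conf.getIntVar(ConfVars.MAXREDUCERS);
    long totalBytes = 10L * 1024 * 1024 * 1024;                            // pretend the siblings report 10 GB
    System.out.println(estimateReducers(totalBytes, bytesPerReducer, maxReducers));
  }
}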