Use of org.apache.hadoop.hive.ql.optimizer.spark.SparkPartitionPruningSinkDesc in project hive by apache.
The class SparkRemoveDynamicPruningBySize, method process:
@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procContext, Object... nodeOutputs) throws SemanticException {
  OptimizeSparkProcContext context = (OptimizeSparkProcContext) procContext;
  SparkPartitionPruningSinkOperator op = (SparkPartitionPruningSinkOperator) nd;
  SparkPartitionPruningSinkDesc desc = op.getConf();
  if (desc.getStatistics().getDataSize() > context.getConf().getLongVar(ConfVars.SPARK_DYNAMIC_PARTITION_PRUNING_MAX_DATA_SIZE)) {
    OperatorUtils.removeBranch(op);
    // at this point we've found the fork in the op pipeline that has the pruning as a child plan.
    LOG.info("Disabling dynamic pruning for: " + desc.getTableScan().getName() + ". Expected data size is too big: " + desc.getStatistics().getDataSize());
  }
  return false;
}
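The decision above is a plain size comparison: if the statistics attached to the pruning sink estimate more data than the limit configured by ConfVars.SPARK_DYNAMIC_PARTITION_PRUNING_MAX_DATA_SIZE, the whole pruning branch is removed. Below is a minimal stand-alone sketch of that check, using hypothetical plain-Java stand-ins rather than Hive's Statistics and HiveConf classes.

// Illustrative sketch only: hypothetical stand-ins for Hive's Statistics and
// configuration lookup, mirroring the size-threshold decision made in process() above.
public final class PruningSizeCheck {

  /** Hypothetical stand-in for the statistics object attached to the pruning sink. */
  static final class Stats {
    final long dataSize;
    Stats(long dataSize) { this.dataSize = dataSize; }
  }

  /** True when the pruning branch should be removed because its estimated
   *  output exceeds the configured maximum data size. */
  static boolean shouldRemovePruningBranch(Stats branchStats, long maxDataSizeBytes) {
    return branchStats.dataSize > maxDataSizeBytes;
  }

  public static void main(String[] args) {
    long maxDataSize = 100L * 1024 * 1024; // assumed example limit (100 MB), not Hive's authoritative default
    System.out.println(shouldRemovePruningBranch(new Stats(512L * 1024 * 1024), maxDataSize)); // true -> remove branch
    System.out.println(shouldRemovePruningBranch(new Stats(4L * 1024 * 1024), maxDataSize));   // false -> keep pruning
  }
}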
Use of org.apache.hadoop.hive.ql.optimizer.spark.SparkPartitionPruningSinkDesc in project hive by apache.
The class SparkCompiler, method connect:
private void connect(Operator<?> o, AtomicInteger index, Stack<Operator<?>> nodes, Map<Operator<?>, Integer> indexes, Map<Operator<?>, Integer> lowLinks, Set<Set<Operator<?>>> components) {
  indexes.put(o, index.get());
  lowLinks.put(o, index.get());
  index.incrementAndGet();
  nodes.push(o);
  List<Operator<?>> children;
  if (o instanceof SparkPartitionPruningSinkOperator) {
    children = new ArrayList<>();
    children.addAll(o.getChildOperators());
    TableScanOperator ts = ((SparkPartitionPruningSinkDesc) o.getConf()).getTableScan();
    LOG.debug("Adding special edge: " + o.getName() + " --> " + ts.toString());
    children.add(ts);
  } else {
    children = o.getChildOperators();
  }
  for (Operator<?> child : children) {
    if (!indexes.containsKey(child)) {
      connect(child, index, nodes, indexes, lowLinks, components);
      lowLinks.put(o, Math.min(lowLinks.get(o), lowLinks.get(child)));
    } else if (nodes.contains(child)) {
      lowLinks.put(o, Math.min(lowLinks.get(o), indexes.get(child)));
    }
  }
  if (lowLinks.get(o).equals(indexes.get(o))) {
    Set<Operator<?>> component = new HashSet<Operator<?>>();
    components.add(component);
    Operator<?> current;
    do {
      current = nodes.pop();
      component.add(current);
    } while (current != o);
  }
}
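connect is Tarjan's strongly-connected-components algorithm applied to the operator DAG, with one extra edge added from each SparkPartitionPruningSinkOperator to the TableScanOperator recorded in its SparkPartitionPruningSinkDesc, so that dependencies introduced through pruning sinks land in the same component. The generic sketch below runs the same algorithm over a plain adjacency map; it is illustrative only and uses no Hive classes.

// Illustrative sketch only: generic Tarjan SCC over an adjacency map, the same
// algorithm that connect() above applies to Hive's operator graph.
import java.util.*;
import java.util.concurrent.atomic.AtomicInteger;

public final class TarjanScc<T> {
  private final Map<T, List<T>> graph;
  private final Map<T, Integer> indexes = new HashMap<>();
  private final Map<T, Integer> lowLinks = new HashMap<>();
  private final Deque<T> stack = new ArrayDeque<>();
  private final AtomicInteger index = new AtomicInteger();
  private final Set<Set<T>> components = new HashSet<>();

  public TarjanScc(Map<T, List<T>> graph) {
    this.graph = graph;
  }

  public Set<Set<T>> components() {
    for (T node : graph.keySet()) {
      if (!indexes.containsKey(node)) {
        connect(node);
      }
    }
    return components;
  }

  private void connect(T node) {
    indexes.put(node, index.get());
    lowLinks.put(node, index.getAndIncrement());
    stack.push(node);
    for (T child : graph.getOrDefault(node, Collections.<T>emptyList())) {
      if (!indexes.containsKey(child)) {
        connect(child);
        lowLinks.put(node, Math.min(lowLinks.get(node), lowLinks.get(child)));
      } else if (stack.contains(child)) {
        // child is still on the stack, so it belongs to the current component
        lowLinks.put(node, Math.min(lowLinks.get(node), indexes.get(child)));
      }
    }
    if (lowLinks.get(node).equals(indexes.get(node))) {
      // node is the root of a component: pop everything above it off the stack
      Set<T> component = new HashSet<>();
      components.add(component);
      T current;
      do {
        current = stack.pop();
        component.add(current);
      } while (!current.equals(node));
    }
  }

  public static void main(String[] args) {
    // Cycle a -> b -> c -> a plus a tail edge c -> d: expect components {a, b, c} and {d}.
    Map<String, List<String>> g = new LinkedHashMap<>();
    g.put("a", Arrays.asList("b"));
    g.put("b", Arrays.asList("c"));
    g.put("c", Arrays.asList("a", "d"));
    g.put("d", Collections.emptyList());
    System.out.println(new TarjanScc<>(g).components());
  }
}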
Use of org.apache.hadoop.hive.ql.optimizer.spark.SparkPartitionPruningSinkDesc in project hive by apache.
The class DynamicPartitionPruningOptimization, method generateEventOperatorPlan:
private void generateEventOperatorPlan(DynamicListContext ctx, ParseContext parseContext, TableScanOperator ts, String column, String columnType) {
  // we will put a fork in the plan at the source of the reduce sink
  Operator<? extends OperatorDesc> parentOfRS = ctx.generator.getParentOperators().get(0);
  // we need the expr that generated the key of the reduce sink
  ExprNodeDesc key = ctx.generator.getConf().getKeyCols().get(ctx.desc.getKeyIndex());
  // we also need the expr for the partitioned table
  ExprNodeDesc partKey = ctx.parent.getChildren().get(0);
  if (LOG.isDebugEnabled()) {
    LOG.debug("key expr: " + key);
    LOG.debug("partition key expr: " + partKey);
  }
  List<ExprNodeDesc> keyExprs = new ArrayList<ExprNodeDesc>();
  keyExprs.add(key);
  // group by requires "ArrayList", don't ask.
  ArrayList<String> outputNames = new ArrayList<String>();
  outputNames.add(HiveConf.getColumnInternalName(0));
  // project the relevant key column
  SelectDesc select = new SelectDesc(keyExprs, outputNames);
  SelectOperator selectOp = (SelectOperator) OperatorFactory.getAndMakeChild(select, parentOfRS);
  // do a group by on the list to dedup
  float groupByMemoryUsage = HiveConf.getFloatVar(parseContext.getConf(), HiveConf.ConfVars.HIVEMAPAGGRHASHMEMORY);
  float memoryThreshold = HiveConf.getFloatVar(parseContext.getConf(), HiveConf.ConfVars.HIVEMAPAGGRMEMORYTHRESHOLD);
  ArrayList<ExprNodeDesc> groupByExprs = new ArrayList<ExprNodeDesc>();
  ExprNodeDesc groupByExpr = new ExprNodeColumnDesc(key.getTypeInfo(), outputNames.get(0), null, false);
  groupByExprs.add(groupByExpr);
  GroupByDesc groupBy = new GroupByDesc(GroupByDesc.Mode.HASH, outputNames, groupByExprs, new ArrayList<AggregationDesc>(), false, groupByMemoryUsage, memoryThreshold, null, false, 0, true);
  GroupByOperator groupByOp = (GroupByOperator) OperatorFactory.getAndMakeChild(groupBy, selectOp);
  Map<String, ExprNodeDesc> colMap = new HashMap<String, ExprNodeDesc>();
  colMap.put(outputNames.get(0), groupByExpr);
  groupByOp.setColumnExprMap(colMap);
  // finally add the event broadcast operator
  if (HiveConf.getVar(parseContext.getConf(), ConfVars.HIVE_EXECUTION_ENGINE).equals("tez")) {
    DynamicPruningEventDesc eventDesc = new DynamicPruningEventDesc();
    eventDesc.setTableScan(ts);
    eventDesc.setTable(PlanUtils.getReduceValueTableDesc(PlanUtils.getFieldSchemasFromColumnList(keyExprs, "key")));
    eventDesc.setTargetColumnName(column);
    eventDesc.setTargetColumnType(columnType);
    eventDesc.setPartKey(partKey);
    OperatorFactory.getAndMakeChild(eventDesc, groupByOp);
  } else {
    // Must be spark branch
    SparkPartitionPruningSinkDesc desc = new SparkPartitionPruningSinkDesc();
    desc.setTableScan(ts);
    desc.setTable(PlanUtils.getReduceValueTableDesc(PlanUtils.getFieldSchemasFromColumnList(keyExprs, "key")));
    desc.setTargetColumnName(column);
    desc.setPartKey(partKey);
    OperatorFactory.getAndMakeChild(desc, groupByOp);
  }
}
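In both branches the method forks the plan at the reduce sink's parent and appends SELECT(key) -> GROUP BY(key, HASH mode, for dedup) -> a sink that broadcasts the pruning values: a DynamicPruningEventDesc on Tez, a SparkPartitionPruningSinkDesc otherwise. The sketch below only mirrors that wiring with hypothetical stand-in classes; none of the types are Hive's.

// Illustrative sketch only: hypothetical stand-in types showing the shape of the
// branch that generateEventOperatorPlan() hangs off the reduce sink's parent.
import java.util.ArrayList;
import java.util.List;

public final class PruningBranchSketch {

  /** Hypothetical minimal operator node; Hive's Operator hierarchy is far richer. */
  static final class Op {
    final String name;
    final List<Op> children = new ArrayList<>();
    Op(String name) { this.name = name; }
    Op addChild(Op child) { children.add(child); return child; }
  }

  public static void main(String[] args) {
    boolean isTez = false; // pretend hive.execution.engine is not "tez", i.e. the Spark branch

    Op parentOfRS = new Op("parent of ReduceSink");
    Op select = parentOfRS.addChild(new Op("SELECT join-key column"));
    Op groupBy = select.addChild(new Op("GROUP BY join-key (HASH mode, dedup)"));
    groupBy.addChild(new Op(isTez
        ? "AppMasterEventOperator(DynamicPruningEventDesc)"
        : "SparkPartitionPruningSinkOperator(SparkPartitionPruningSinkDesc)"));

    // Print the branch from the fork down to the sink.
    for (Op op = parentOfRS; op != null; op = op.children.isEmpty() ? null : op.children.get(0)) {
      System.out.println(op.name);
    }
  }
}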
Use of org.apache.hadoop.hive.ql.optimizer.spark.SparkPartitionPruningSinkDesc in project hive by apache.
The class GenSparkUtils, method processPartitionPruningSink:
/**
 * Populate partition pruning information from the pruning sink operator to the
 * target MapWork (the MapWork for the big table side). The information includes the source table
 * name, column name, and partition key expression. It also sets up the temporary path used to
 * communicate between the target MapWork and the source BaseWork.
 *
 * Here "source" refers to the small table side, while "target" refers to the big
 * table side.
 *
 * @param context the Spark process context.
 * @param pruningSink the pruning sink operator being processed.
 */
public void processPartitionPruningSink(GenSparkProcContext context, SparkPartitionPruningSinkOperator pruningSink) {
  SparkPartitionPruningSinkDesc desc = pruningSink.getConf();
  TableScanOperator ts = desc.getTableScan();
  MapWork targetWork = (MapWork) context.rootToWorkMap.get(ts);
  Preconditions.checkArgument(targetWork != null, "No targetWork found for tablescan " + ts);
  String targetId = SparkUtilities.getWorkId(targetWork);
  BaseWork sourceWork = getEnclosingWork(pruningSink, context);
  String sourceId = SparkUtilities.getWorkId(sourceWork);
  // set up temporary path to communicate between the small/big table
  Path tmpPath = targetWork.getTmpPathForPartitionPruning();
  if (tmpPath == null) {
    Path baseTmpPath = context.parseContext.getContext().getMRTmpPath();
    tmpPath = SparkUtilities.generateTmpPathForPartitionPruning(baseTmpPath, targetId);
    targetWork.setTmpPathForPartitionPruning(tmpPath);
    LOG.info("Setting tmp path between source work and target work:\n" + tmpPath);
  }
  desc.setPath(new Path(tmpPath, sourceId));
  desc.setTargetWork(targetWork.getName());
  // store table descriptor in map-targetWork
  if (!targetWork.getEventSourceTableDescMap().containsKey(sourceId)) {
    targetWork.getEventSourceTableDescMap().put(sourceId, new LinkedList<TableDesc>());
  }
  List<TableDesc> tables = targetWork.getEventSourceTableDescMap().get(sourceId);
  tables.add(pruningSink.getConf().getTable());
  // store column name in map-targetWork
  if (!targetWork.getEventSourceColumnNameMap().containsKey(sourceId)) {
    targetWork.getEventSourceColumnNameMap().put(sourceId, new LinkedList<String>());
  }
  List<String> columns = targetWork.getEventSourceColumnNameMap().get(sourceId);
  columns.add(desc.getTargetColumnName());
  // store partition key expr in map-targetWork
  if (!targetWork.getEventSourcePartKeyExprMap().containsKey(sourceId)) {
    targetWork.getEventSourcePartKeyExprMap().put(sourceId, new LinkedList<ExprNodeDesc>());
  }
  List<ExprNodeDesc> keys = targetWork.getEventSourcePartKeyExprMap().get(sourceId);
  keys.add(desc.getPartKey());
}
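All three event-source maps on the target MapWork are filled with the same "create the list on first use, then append" pattern, keyed by the source work id. Below is a minimal plain-Java sketch of that pattern using computeIfAbsent; the value types here are hypothetical strings, whereas Hive stores TableDesc, String and ExprNodeDesc respectively.

// Illustrative sketch only: the same per-source-work-id bookkeeping as above,
// written against plain maps instead of Hive's MapWork accessors.
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

public final class EventSourceMaps {

  // Hypothetical stand-ins for MapWork's eventSource*Map structures.
  private final Map<String, List<String>> tableDescs = new HashMap<>();
  private final Map<String, List<String>> columnNames = new HashMap<>();
  private final Map<String, List<String>> partKeyExprs = new HashMap<>();

  /** Append one pruning source's table desc, target column and partition key expr. */
  void register(String sourceWorkId, String tableDesc, String columnName, String partKeyExpr) {
    tableDescs.computeIfAbsent(sourceWorkId, k -> new LinkedList<>()).add(tableDesc);
    columnNames.computeIfAbsent(sourceWorkId, k -> new LinkedList<>()).add(columnName);
    partKeyExprs.computeIfAbsent(sourceWorkId, k -> new LinkedList<>()).add(partKeyExpr);
  }

  public static void main(String[] args) {
    EventSourceMaps maps = new EventSourceMaps();
    maps.register("Map 2", "dim-table-desc", "part_col", "Column[part_col]");
    maps.register("Map 2", "other-dim-desc", "other_col", "Column[other_col]");
    System.out.println(maps.columnNames); // {Map 2=[part_col, other_col]}
  }
}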
Use of org.apache.hadoop.hive.ql.optimizer.spark.SparkPartitionPruningSinkDesc in project hive by apache.
The class Vectorizer, method vectorizeOperator:
public Operator<? extends OperatorDesc> vectorizeOperator(Operator<? extends OperatorDesc> op, VectorizationContext vContext, boolean isTezOrSpark, VectorTaskColumnInfo vectorTaskColumnInfo) throws HiveException {
  Operator<? extends OperatorDesc> vectorOp = null;
  boolean isNative;
  switch (op.getType()) {
    case TABLESCAN:
      vectorOp = vectorizeTableScanOperator(op, vContext);
      isNative = true;
      break;
    case MAPJOIN:
      {
        if (op instanceof MapJoinOperator) {
          VectorMapJoinInfo vectorMapJoinInfo = new VectorMapJoinInfo();
          MapJoinDesc desc = (MapJoinDesc) op.getConf();
          boolean specialize = canSpecializeMapJoin(op, desc, isTezOrSpark, vContext, vectorMapJoinInfo);
          if (!specialize) {
            Class<? extends Operator<?>> opClass = null;
            // *NON-NATIVE* vector map differences for LEFT OUTER JOIN and Filtered...
            List<ExprNodeDesc> bigTableFilters = desc.getFilters().get((byte) desc.getPosBigTable());
            boolean isOuterAndFiltered = (!desc.isNoOuterJoin() && bigTableFilters.size() > 0);
            if (!isOuterAndFiltered) {
              opClass = VectorMapJoinOperator.class;
            } else {
              opClass = VectorMapJoinOuterFilteredOperator.class;
            }
            vectorOp = OperatorFactory.getVectorOperator(opClass, op.getCompilationOpContext(), op.getConf(), vContext);
            isNative = false;
          } else {
            // TEMPORARY Until Native Vector Map Join with Hybrid passes tests...
            // HiveConf.setBoolVar(physicalContext.getConf(),
            // HiveConf.ConfVars.HIVEUSEHYBRIDGRACEHASHJOIN, false);
            vectorOp = specializeMapJoinOperator(op, vContext, desc, vectorMapJoinInfo);
            isNative = true;
            if (vectorTaskColumnInfo != null) {
              if (usesVectorUDFAdaptor(vectorMapJoinInfo.getBigTableKeyExpressions())) {
                vectorTaskColumnInfo.setUsesVectorUDFAdaptor(true);
              }
              if (usesVectorUDFAdaptor(vectorMapJoinInfo.getBigTableValueExpressions())) {
                vectorTaskColumnInfo.setUsesVectorUDFAdaptor(true);
              }
            }
          }
        } else {
          Preconditions.checkState(op instanceof SMBMapJoinOperator);
          SMBJoinDesc smbJoinSinkDesc = (SMBJoinDesc) op.getConf();
          VectorSMBJoinDesc vectorSMBJoinDesc = new VectorSMBJoinDesc();
          smbJoinSinkDesc.setVectorDesc(vectorSMBJoinDesc);
          vectorOp = OperatorFactory.getVectorOperator(op.getCompilationOpContext(), smbJoinSinkDesc, vContext);
          isNative = false;
        }
      }
      break;
    case REDUCESINK:
      {
        VectorReduceSinkInfo vectorReduceSinkInfo = new VectorReduceSinkInfo();
        ReduceSinkDesc desc = (ReduceSinkDesc) op.getConf();
        boolean specialize = canSpecializeReduceSink(desc, isTezOrSpark, vContext, vectorReduceSinkInfo);
        if (!specialize) {
          vectorOp = OperatorFactory.getVectorOperator(op.getCompilationOpContext(), op.getConf(), vContext);
          isNative = false;
        } else {
          vectorOp = specializeReduceSinkOperator(op, vContext, desc, vectorReduceSinkInfo);
          isNative = true;
          if (vectorTaskColumnInfo != null) {
            if (usesVectorUDFAdaptor(vectorReduceSinkInfo.getReduceSinkKeyExpressions())) {
              vectorTaskColumnInfo.setUsesVectorUDFAdaptor(true);
            }
            if (usesVectorUDFAdaptor(vectorReduceSinkInfo.getReduceSinkValueExpressions())) {
              vectorTaskColumnInfo.setUsesVectorUDFAdaptor(true);
            }
          }
        }
      }
      break;
    case FILTER:
      {
        vectorOp = vectorizeFilterOperator(op, vContext);
        isNative = true;
        if (vectorTaskColumnInfo != null) {
          VectorFilterDesc vectorFilterDesc = (VectorFilterDesc) ((AbstractOperatorDesc) vectorOp.getConf()).getVectorDesc();
          VectorExpression vectorPredicateExpr = vectorFilterDesc.getPredicateExpression();
          if (usesVectorUDFAdaptor(vectorPredicateExpr)) {
            vectorTaskColumnInfo.setUsesVectorUDFAdaptor(true);
          }
        }
      }
      break;
    case SELECT:
      {
        vectorOp = vectorizeSelectOperator(op, vContext);
        isNative = true;
        if (vectorTaskColumnInfo != null) {
          VectorSelectDesc vectorSelectDesc = (VectorSelectDesc) ((AbstractOperatorDesc) vectorOp.getConf()).getVectorDesc();
          VectorExpression[] vectorSelectExprs = vectorSelectDesc.getSelectExpressions();
          if (usesVectorUDFAdaptor(vectorSelectExprs)) {
            vectorTaskColumnInfo.setUsesVectorUDFAdaptor(true);
          }
        }
      }
      break;
    case GROUPBY:
      {
        vectorOp = vectorizeGroupByOperator(op, vContext);
        isNative = false;
        if (vectorTaskColumnInfo != null) {
          VectorGroupByDesc vectorGroupByDesc = (VectorGroupByDesc) ((AbstractOperatorDesc) vectorOp.getConf()).getVectorDesc();
          if (!vectorGroupByDesc.isVectorOutput()) {
            vectorTaskColumnInfo.setGroupByVectorOutput(false);
          }
          VectorExpression[] vecKeyExpressions = vectorGroupByDesc.getKeyExpressions();
          if (usesVectorUDFAdaptor(vecKeyExpressions)) {
            vectorTaskColumnInfo.setUsesVectorUDFAdaptor(true);
          }
          VectorAggregateExpression[] vecAggregators = vectorGroupByDesc.getAggregators();
          for (VectorAggregateExpression vecAggr : vecAggregators) {
            if (usesVectorUDFAdaptor(vecAggr.inputExpression())) {
              vectorTaskColumnInfo.setUsesVectorUDFAdaptor(true);
            }
          }
        }
      }
      break;
    case FILESINK:
      {
        FileSinkDesc fileSinkDesc = (FileSinkDesc) op.getConf();
        VectorFileSinkDesc vectorFileSinkDesc = new VectorFileSinkDesc();
        fileSinkDesc.setVectorDesc(vectorFileSinkDesc);
        vectorOp = OperatorFactory.getVectorOperator(op.getCompilationOpContext(), fileSinkDesc, vContext);
        isNative = false;
      }
      break;
    case LIMIT:
      {
        LimitDesc limitDesc = (LimitDesc) op.getConf();
        VectorLimitDesc vectorLimitDesc = new VectorLimitDesc();
        limitDesc.setVectorDesc(vectorLimitDesc);
        vectorOp = OperatorFactory.getVectorOperator(op.getCompilationOpContext(), limitDesc, vContext);
        isNative = true;
      }
      break;
    case EVENT:
      {
        AppMasterEventDesc eventDesc = (AppMasterEventDesc) op.getConf();
        VectorAppMasterEventDesc vectorEventDesc = new VectorAppMasterEventDesc();
        eventDesc.setVectorDesc(vectorEventDesc);
        vectorOp = OperatorFactory.getVectorOperator(op.getCompilationOpContext(), eventDesc, vContext);
        isNative = true;
      }
      break;
    case HASHTABLESINK:
      {
        SparkHashTableSinkDesc sparkHashTableSinkDesc = (SparkHashTableSinkDesc) op.getConf();
        VectorSparkHashTableSinkDesc vectorSparkHashTableSinkDesc = new VectorSparkHashTableSinkDesc();
        sparkHashTableSinkDesc.setVectorDesc(vectorSparkHashTableSinkDesc);
        vectorOp = OperatorFactory.getVectorOperator(op.getCompilationOpContext(), sparkHashTableSinkDesc, vContext);
        isNative = true;
      }
      break;
    case SPARKPRUNINGSINK:
      {
        SparkPartitionPruningSinkDesc sparkPartitionPruningSinkDesc = (SparkPartitionPruningSinkDesc) op.getConf();
        VectorSparkPartitionPruningSinkDesc vectorSparkPartitionPruningSinkDesc = new VectorSparkPartitionPruningSinkDesc();
        sparkPartitionPruningSinkDesc.setVectorDesc(vectorSparkPartitionPruningSinkDesc);
        vectorOp = OperatorFactory.getVectorOperator(op.getCompilationOpContext(), sparkPartitionPruningSinkDesc, vContext);
        isNative = true;
      }
      break;
    default:
      // These are children of GROUP BY operators with non-vector outputs.
      isNative = false;
      vectorOp = op;
      break;
  }
  Preconditions.checkState(vectorOp != null);
  if (vectorTaskColumnInfo != null && !isNative) {
    vectorTaskColumnInfo.setAllNative(false);
  }
  LOG.debug("vectorizeOperator " + vectorOp.getClass().getName());
  LOG.debug("vectorizeOperator " + vectorOp.getConf().getClass().getName());
  if (vectorOp != op) {
    fixupParentChildOperators(op, vectorOp);
    ((AbstractOperatorDesc) vectorOp.getConf()).setVectorMode(true);
  }
  return vectorOp;
}
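Apart from the specialized map-join and reduce-sink branches, most cases above follow one pattern: attach a fresh Vector*Desc to the operator's existing descriptor, ask OperatorFactory for the vectorized operator built from that descriptor, and record whether the result counts as native. The sketch below only mirrors that pattern with hypothetical stand-in types; it is not Hive's planner code.

// Illustrative sketch only: the "wrap the descriptor and rebuild the operator"
// pattern used by e.g. the SPARKPRUNINGSINK branch, with hypothetical types.
public final class VectorizeSketch {

  static final class Desc { Object vectorDesc; }
  static final class VectorDesc { }

  static final class Op {
    final String name;
    final Desc conf;
    Op(String name, Desc conf) { this.name = name; this.conf = conf; }
  }

  static final class Result {
    final Op vectorOp;
    final boolean isNative;
    Result(Op vectorOp, boolean isNative) { this.vectorOp = vectorOp; this.isNative = isNative; }
  }

  /** Attach a vector descriptor to the op's conf and build the vectorized op. */
  static Result vectorize(Op op, boolean nativeForThisOpType) {
    op.conf.vectorDesc = new VectorDesc(); // corresponds to desc.setVectorDesc(...)
    Op vectorOp = new Op("Vector" + op.name, op.conf); // corresponds to OperatorFactory.getVectorOperator(...)
    return new Result(vectorOp, nativeForThisOpType);
  }

  public static void main(String[] args) {
    Result r = vectorize(new Op("SparkPartitionPruningSink", new Desc()), true);
    System.out.println(r.vectorOp.name + " native=" + r.isNative);
  }
}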