Use of org.apache.hadoop.hive.ql.plan.StatsNoJobWork in project hive by apache.
The class SparkProcessAnalyzeTable, method process.
@SuppressWarnings("unchecked")
@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procContext, Object... nodeOutputs) throws SemanticException {
  GenSparkProcContext context = (GenSparkProcContext) procContext;
  TableScanOperator tableScan = (TableScanOperator) nd;
  ParseContext parseContext = context.parseContext;
  @SuppressWarnings("rawtypes") Class<? extends InputFormat> inputFormat = tableScan.getConf().getTableMetadata().getInputFormatClass();
  if (parseContext.getQueryProperties().isAnalyzeCommand()) {
    Preconditions.checkArgument(tableScan.getChildOperators() == null || tableScan.getChildOperators().size() == 0, "AssertionError: expected tableScan.getChildOperators() to be null, " + "or tableScan.getChildOperators().size() to be 0");
    String alias = null;
    for (String a : parseContext.getTopOps().keySet()) {
      if (tableScan == parseContext.getTopOps().get(a)) {
        alias = a;
      }
    }
    Preconditions.checkArgument(alias != null, "AssertionError: expected alias to be not null");
    SparkWork sparkWork = context.currentTask.getWork();
    boolean partialScan = parseContext.getQueryProperties().isPartialScanAnalyzeCommand();
    boolean noScan = parseContext.getQueryProperties().isNoScanAnalyzeCommand();
    if (inputFormat.equals(OrcInputFormat.class) && (noScan || partialScan)) {
      // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS partialscan;
      // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS noscan;
      // There will not be any Spark job above this task
      StatsNoJobWork snjWork = new StatsNoJobWork(tableScan.getConf().getTableMetadata().getTableSpec());
      snjWork.setStatsReliable(parseContext.getConf().getBoolVar(HiveConf.ConfVars.HIVE_STATS_RELIABLE));
      Task<StatsNoJobWork> snjTask = TaskFactory.get(snjWork, parseContext.getConf());
      snjTask.setParentTasks(null);
      context.rootTasks.remove(context.currentTask);
      context.rootTasks.add(snjTask);
      return true;
    } else {
      // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS;
      // The plan consists of a simple SparkTask followed by a StatsTask.
      // The Spark task is just a simple TableScanOperator
      StatsWork statsWork = new StatsWork(tableScan.getConf().getTableMetadata().getTableSpec());
      statsWork.setAggKey(tableScan.getConf().getStatsAggPrefix());
      statsWork.setStatsTmpDir(tableScan.getConf().getTmpStatsDir());
      statsWork.setSourceTask(context.currentTask);
      statsWork.setStatsReliable(parseContext.getConf().getBoolVar(HiveConf.ConfVars.HIVE_STATS_RELIABLE));
      Task<StatsWork> statsTask = TaskFactory.get(statsWork, parseContext.getConf());
      context.currentTask.addDependentTask(statsTask);
      // The plan consists of a StatsTask only.
      if (parseContext.getQueryProperties().isNoScanAnalyzeCommand()) {
        statsTask.setParentTasks(null);
        statsWork.setNoScanAnalyzeCommand(true);
        context.rootTasks.remove(context.currentTask);
        context.rootTasks.add(statsTask);
      }
      // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS partialscan;
      if (parseContext.getQueryProperties().isPartialScanAnalyzeCommand()) {
        handlePartialScanCommand(tableScan, parseContext, statsWork, context, statsTask);
      }
      // NOTE: here we should use the new partition predicate pushdown API to get a list of pruned list,
      // and pass it to setTaskPlan as the last parameter
      Set<Partition> confirmedPartns = GenMapRedUtils.getConfirmedPartitionsForScan(tableScan);
      PrunedPartitionList partitions = null;
      if (confirmedPartns.size() > 0) {
        Table source = tableScan.getConf().getTableMetadata();
        List<String> partCols = GenMapRedUtils.getPartitionColumns(tableScan);
        partitions = new PrunedPartitionList(source, confirmedPartns, partCols, false);
      }
      MapWork w = utils.createMapWork(context, tableScan, sparkWork, partitions);
      w.setGatheringStats(true);
      return true;
    }
  }
  return null;
}
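For the ORC noscan/partialscan branch above, the interesting part is the plan surgery at the end: the metadata-only stats task keeps no parents and replaces the current SparkTask as the root of the plan. Below is a minimal, self-contained sketch of that rewiring; PlanTask and PlanContext are hypothetical stand-ins for illustration only, not Hive's Task or GenSparkProcContext API.

import java.util.ArrayList;
import java.util.List;

public class RootTaskRewireSketch {

  // Hypothetical stand-in for a Hive Task.
  static class PlanTask {
    final String name;
    List<PlanTask> parents = new ArrayList<>();
    PlanTask(String name) { this.name = name; }
    @Override public String toString() { return name; }
  }

  // Hypothetical stand-in for GenSparkProcContext.rootTasks / currentTask.
  static class PlanContext {
    final List<PlanTask> rootTasks = new ArrayList<>();
    PlanTask currentTask;
  }

  public static void main(String[] args) {
    PlanContext ctx = new PlanContext();
    ctx.currentTask = new PlanTask("SparkTask");
    ctx.rootTasks.add(ctx.currentTask);

    // Mirrors the ORC noscan/partialscan branch: build a metadata-only stats task,
    // detach it from any parents, and swap it in as the root of the plan.
    PlanTask statsNoJobTask = new PlanTask("StatsNoJobTask");
    statsNoJobTask.parents = null;              // snjTask.setParentTasks(null)
    ctx.rootTasks.remove(ctx.currentTask);      // context.rootTasks.remove(context.currentTask)
    ctx.rootTasks.add(statsNoJobTask);          // context.rootTasks.add(snjTask)

    System.out.println("Root tasks after rewiring: " + ctx.rootTasks); // [StatsNoJobTask]
  }
}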
Use of org.apache.hadoop.hive.ql.plan.StatsNoJobWork in project hive by apache.
The class GenMRTableScan1, method process.
/**
 * Table scan encountered.
 * @param nd
 *          the table scan operator encountered
 * @param opProcCtx
 *          context
 */
@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx opProcCtx, Object... nodeOutputs) throws SemanticException {
  TableScanOperator op = (TableScanOperator) nd;
  GenMRProcContext ctx = (GenMRProcContext) opProcCtx;
  ParseContext parseCtx = ctx.getParseCtx();
  Class<? extends InputFormat> inputFormat = op.getConf().getTableMetadata().getInputFormatClass();
  Map<Operator<? extends OperatorDesc>, GenMapRedCtx> mapCurrCtx = ctx.getMapCurrCtx();
  // create a dummy MapReduce task
  MapredWork currWork = GenMapRedUtils.getMapRedWork(parseCtx);
  MapRedTask currTask = (MapRedTask) TaskFactory.get(currWork, parseCtx.getConf());
  ctx.setCurrTask(currTask);
  ctx.setCurrTopOp(op);
  for (String alias : parseCtx.getTopOps().keySet()) {
    Operator<? extends OperatorDesc> currOp = parseCtx.getTopOps().get(alias);
    if (currOp == op) {
      String currAliasId = alias;
      ctx.setCurrAliasId(currAliasId);
      mapCurrCtx.put(op, new GenMapRedCtx(currTask, currAliasId));
      if (parseCtx.getQueryProperties().isAnalyzeCommand()) {
        boolean partialScan = parseCtx.getQueryProperties().isPartialScanAnalyzeCommand();
        boolean noScan = parseCtx.getQueryProperties().isNoScanAnalyzeCommand();
        if (OrcInputFormat.class.isAssignableFrom(inputFormat) || MapredParquetInputFormat.class.isAssignableFrom(inputFormat)) {
          // For ORC and Parquet, all the following statements are the same
          // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS
          // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS partialscan;
          // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS noscan;
          // There will not be any MR or Tez job above this task
          StatsNoJobWork snjWork = new StatsNoJobWork(op.getConf().getTableMetadata().getTableSpec());
          snjWork.setStatsReliable(parseCtx.getConf().getBoolVar(HiveConf.ConfVars.HIVE_STATS_RELIABLE));
          // If partition is specified, get pruned partition list
          Set<Partition> confirmedParts = GenMapRedUtils.getConfirmedPartitionsForScan(op);
          if (confirmedParts.size() > 0) {
            Table source = op.getConf().getTableMetadata();
            List<String> partCols = GenMapRedUtils.getPartitionColumns(op);
            PrunedPartitionList partList = new PrunedPartitionList(source, confirmedParts, partCols, false);
            snjWork.setPrunedPartitionList(partList);
          }
          Task<StatsNoJobWork> snjTask = TaskFactory.get(snjWork, parseCtx.getConf());
          ctx.setCurrTask(snjTask);
          ctx.setCurrTopOp(null);
          ctx.getRootTasks().clear();
          ctx.getRootTasks().add(snjTask);
        } else {
          // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS;
          // The plan consists of a simple MapRedTask followed by a StatsTask.
          // The MR task is just a simple TableScanOperator
          StatsWork statsWork = new StatsWork(op.getConf().getTableMetadata().getTableSpec());
          statsWork.setAggKey(op.getConf().getStatsAggPrefix());
          statsWork.setStatsTmpDir(op.getConf().getTmpStatsDir());
          statsWork.setSourceTask(currTask);
          statsWork.setStatsReliable(parseCtx.getConf().getBoolVar(HiveConf.ConfVars.HIVE_STATS_RELIABLE));
          Task<StatsWork> statsTask = TaskFactory.get(statsWork, parseCtx.getConf());
          currTask.addDependentTask(statsTask);
          if (!ctx.getRootTasks().contains(currTask)) {
            ctx.getRootTasks().add(currTask);
          }
          // The plan consists of a StatsTask only.
          if (noScan) {
            statsTask.setParentTasks(null);
            statsWork.setNoScanAnalyzeCommand(true);
            ctx.getRootTasks().remove(currTask);
            ctx.getRootTasks().add(statsTask);
          }
          // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS partialscan;
          if (partialScan) {
            handlePartialScanCommand(op, ctx, parseCtx, currTask, statsWork, statsTask);
          }
          currWork.getMapWork().setGatheringStats(true);
          if (currWork.getReduceWork() != null) {
            currWork.getReduceWork().setGatheringStats(true);
          }
          // NOTE: here we should use the new partition predicate pushdown API to get a list of pruned list,
          // and pass it to setTaskPlan as the last parameter
          Set<Partition> confirmedPartns = GenMapRedUtils.getConfirmedPartitionsForScan(op);
          if (confirmedPartns.size() > 0) {
            Table source = op.getConf().getTableMetadata();
            List<String> partCols = GenMapRedUtils.getPartitionColumns(op);
            PrunedPartitionList partList = new PrunedPartitionList(source, confirmedPartns, partCols, false);
            GenMapRedUtils.setTaskPlan(currAliasId, op, currTask, false, ctx, partList);
          } else {
            // non-partitioned table
            GenMapRedUtils.setTaskPlan(currAliasId, op, currTask, false, ctx);
          }
        }
      }
      return true;
    }
  }
  assert false;
  return null;
}
Use of org.apache.hadoop.hive.ql.plan.StatsNoJobWork in project hive by apache.
The class ProcessAnalyzeTable, method process.
@SuppressWarnings("unchecked")
@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procContext, Object... nodeOutputs) throws SemanticException {
  GenTezProcContext context = (GenTezProcContext) procContext;
  TableScanOperator tableScan = (TableScanOperator) nd;
  ParseContext parseContext = context.parseContext;
  Class<? extends InputFormat> inputFormat = tableScan.getConf().getTableMetadata().getInputFormatClass();
  if (parseContext.getQueryProperties().isAnalyzeCommand()) {
    assert tableScan.getChildOperators() == null || tableScan.getChildOperators().size() == 0;
    String alias = null;
    for (String a : parseContext.getTopOps().keySet()) {
      if (tableScan == parseContext.getTopOps().get(a)) {
        alias = a;
      }
    }
    assert alias != null;
    TezWork tezWork = context.currentTask.getWork();
    if (inputFormat.equals(OrcInputFormat.class)) {
      // For ORC, all the following statements are the same
      // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS
      // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS partialscan;
      // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS noscan;
      // There will not be any Tez job above this task
      StatsNoJobWork snjWork = new StatsNoJobWork(tableScan.getConf().getTableMetadata().getTableSpec());
      snjWork.setStatsReliable(parseContext.getConf().getBoolVar(HiveConf.ConfVars.HIVE_STATS_RELIABLE));
      // If partition is specified, get pruned partition list
      Set<Partition> confirmedParts = GenMapRedUtils.getConfirmedPartitionsForScan(tableScan);
      if (confirmedParts.size() > 0) {
        Table source = tableScan.getConf().getTableMetadata();
        List<String> partCols = GenMapRedUtils.getPartitionColumns(tableScan);
        PrunedPartitionList partList = new PrunedPartitionList(source, confirmedParts, partCols, false);
        snjWork.setPrunedPartitionList(partList);
      }
      Task<StatsNoJobWork> snjTask = TaskFactory.get(snjWork, parseContext.getConf());
      snjTask.setParentTasks(null);
      context.rootTasks.remove(context.currentTask);
      context.rootTasks.add(snjTask);
      return true;
    } else {
      // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS;
      // The plan consists of a simple TezTask followed by a StatsTask.
      // The Tez task is just a simple TableScanOperator
      StatsWork statsWork = new StatsWork(tableScan.getConf().getTableMetadata().getTableSpec());
      statsWork.setAggKey(tableScan.getConf().getStatsAggPrefix());
      statsWork.setStatsTmpDir(tableScan.getConf().getTmpStatsDir());
      statsWork.setSourceTask(context.currentTask);
      statsWork.setStatsReliable(parseContext.getConf().getBoolVar(HiveConf.ConfVars.HIVE_STATS_RELIABLE));
      Task<StatsWork> statsTask = TaskFactory.get(statsWork, parseContext.getConf());
      context.currentTask.addDependentTask(statsTask);
      // The plan consists of a StatsTask only.
      if (parseContext.getQueryProperties().isNoScanAnalyzeCommand()) {
        statsTask.setParentTasks(null);
        statsWork.setNoScanAnalyzeCommand(true);
        context.rootTasks.remove(context.currentTask);
        context.rootTasks.add(statsTask);
      }
      // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS partialscan;
      if (parseContext.getQueryProperties().isPartialScanAnalyzeCommand()) {
        handlePartialScanCommand(tableScan, parseContext, statsWork, context, statsTask);
      }
      // NOTE: here we should use the new partition predicate pushdown API to get a list of pruned list,
      // and pass it to setTaskPlan as the last parameter
      Set<Partition> confirmedPartns = GenMapRedUtils.getConfirmedPartitionsForScan(tableScan);
      PrunedPartitionList partitions = null;
      if (confirmedPartns.size() > 0) {
        Table source = tableScan.getConf().getTableMetadata();
        List<String> partCols = GenMapRedUtils.getPartitionColumns(tableScan);
        partitions = new PrunedPartitionList(source, confirmedPartns, partCols, false);
      }
      MapWork w = utils.createMapWork(context, tableScan, tezWork, partitions);
      w.setGatheringStats(true);
      return true;
    }
  } else if (parseContext.getAnalyzeRewrite() != null) {
    // we need to collect table stats while collecting column stats.
    try {
      context.currentTask.addDependentTask(genTableStats(context, tableScan));
    } catch (HiveException e) {
      throw new SemanticException(e);
    }
  }
  return null;
}
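The branch structure above boils down to a two-way choice for ANALYZE on Tez: ORC tables get a metadata-only StatsNoJobWork task (regardless of the noscan/partialscan suffix), while every other format gets a scan task followed by a StatsTask. The following is a hedged sketch of that decision; the enum and method are illustrative stand-ins and not part of Hive's API.

public class AnalyzeStrategySketch {

  enum Strategy { METADATA_ONLY_STATS, SCAN_PLUS_STATS_TASK }

  // isOrc stands in for inputFormat.equals(OrcInputFormat.class) in the Tez processor.
  static Strategy chooseStrategy(boolean isOrc) {
    return isOrc ? Strategy.METADATA_ONLY_STATS : Strategy.SCAN_PLUS_STATS_TASK;
  }

  public static void main(String[] args) {
    System.out.println(chooseStrategy(true));   // METADATA_ONLY_STATS: StatsNoJobWork, no Tez job
    System.out.println(chooseStrategy(false));  // SCAN_PLUS_STATS_TASK: TezTask scan + StatsTask
  }
}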
Use of org.apache.hadoop.hive.ql.plan.StatsNoJobWork in project hive by apache.
The class ProcessAnalyzeTable, method genTableStats.
private Task<?> genTableStats(GenTezProcContext context, TableScanOperator tableScan) throws HiveException {
  Class<? extends InputFormat> inputFormat = tableScan.getConf().getTableMetadata().getInputFormatClass();
  ParseContext parseContext = context.parseContext;
  Table table = tableScan.getConf().getTableMetadata();
  List<Partition> partitions = new ArrayList<>();
  if (table.isPartitioned()) {
    partitions.addAll(parseContext.getPrunedPartitions(tableScan).getPartitions());
    for (Partition partn : partitions) {
      LOG.debug("XXX: adding part: " + partn);
      context.outputs.add(new WriteEntity(partn, WriteEntity.WriteType.DDL_NO_LOCK));
    }
  }
  TableSpec tableSpec = new TableSpec(table, partitions);
  tableScan.getConf().getTableMetadata().setTableSpec(tableSpec);
  if (inputFormat.equals(OrcInputFormat.class)) {
    // For ORC, there is no Tez Job for table stats.
    StatsNoJobWork snjWork = new StatsNoJobWork(tableScan.getConf().getTableMetadata().getTableSpec());
    snjWork.setStatsReliable(parseContext.getConf().getBoolVar(HiveConf.ConfVars.HIVE_STATS_RELIABLE));
    // If partition is specified, get pruned partition list
    if (partitions.size() > 0) {
      snjWork.setPrunedPartitionList(parseContext.getPrunedPartitions(tableScan));
    }
    return TaskFactory.get(snjWork, parseContext.getConf());
  } else {
    StatsWork statsWork = new StatsWork(tableScan.getConf().getTableMetadata().getTableSpec());
    statsWork.setAggKey(tableScan.getConf().getStatsAggPrefix());
    statsWork.setStatsTmpDir(tableScan.getConf().getTmpStatsDir());
    statsWork.setSourceTask(context.currentTask);
    statsWork.setStatsReliable(parseContext.getConf().getBoolVar(HiveConf.ConfVars.HIVE_STATS_RELIABLE));
    return TaskFactory.get(statsWork, parseContext.getConf());
  }
}
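One detail worth calling out in genTableStats is the partitioned-table handling: each pruned partition is added to the TableSpec and also registered as a DDL_NO_LOCK write entity so the planner records that the partition's stats will be updated. Below is a rough, self-contained sketch of that pattern; the types are hypothetical stand-ins, not Hive's Partition or WriteEntity classes.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

public class TableStatsOutputsSketch {

  // Hypothetical stand-in for a Hive Partition.
  static class Part {
    final String name;
    Part(String n) { name = n; }
  }

  public static void main(String[] args) {
    boolean isPartitioned = true;                       // table.isPartitioned()
    List<Part> pruned = Arrays.asList(new Part("ds=2024-01-01"), new Part("ds=2024-01-02"));

    List<Part> partitions = new ArrayList<>();
    Set<String> outputs = new HashSet<>();              // stand-in for context.outputs
    if (isPartitioned) {
      partitions.addAll(pruned);                        // partitions feed the TableSpec
      for (Part p : partitions) {
        outputs.add("DDL_NO_LOCK:" + p.name);           // new WriteEntity(partn, DDL_NO_LOCK)
      }
    }
    System.out.println(outputs);
  }
}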